Nur Inna Alfianinda
Explore and predict loan classification dataset using descriptive statistics, univariate, bivariate, multivariate analysis, and machine learning algorithm.
Create a system to help predict and calculate if a person will be a loan defaulter or not automatically
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
loans_url='https://drive.google.com/file/d/1R3n24vzom8zYfkXcUeJg44u37hjVLbX2/view?usp=sharing'
loans_url='https://drive.google.com/uc?id=' + loans_url.split('/')[-2]
df_loans = pd.read_csv(loans_url)
df_loans.head()
| Unnamed: 0 | ID | Loan Amount | Funded Amount | Funded Amount Investor | Term | Batch Enrolled | Interest Rate | Grade | Sub Grade | Employment Duration | Home Ownership | Verification Status | Payment Plan | Loan Title | Debit to Income | Delinquency - two years | Inquires - six months | Open Account | Public Record | Revolving Balance | Revolving Utilities | Total Accounts | Initial List Status | Total Received Interest | Total Received Late Fee | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Application Type | Last week Pay | Accounts Delinquent | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | Gender | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 65087372 | 10000 | 32236 | 12329.36286 | 59 | BAT2522922 | 11.135007 | B | C4 | MORTGAGE | 176346.62670 | Not Verified | n | Debt Consolidation | 16.284758 | 1 | 0 | 13 | 0 | 24246 | 74.932551 | 7 | w | 2929.646315 | 0.102055 | 2.498291 | 0.793724 | 0 | INDIVIDUAL | 49 | 0 | 31 | 311301.0 | 6619 | 0 | Male |
| 1 | 1 | 1450153 | 3609 | 11940 | 12191.99692 | 59 | BAT1586599 | 12.237563 | C | D3 | RENT | 39833.92100 | Source Verified | n | Debt consolidation | 15.412409 | 0 | 0 | 12 | 0 | 812 | 78.297186 | 13 | f | 772.769385 | 0.036181 | 2.377215 | 0.974821 | 0 | INDIVIDUAL | 109 | 0 | 53 | 182610.0 | 20885 | 0 | Male |
| 2 | 2 | 1969101 | 28276 | 9311 | 21603.22455 | 59 | BAT2136391 | NaN | F | D4 | MORTGAGE | 91506.69105 | Source Verified | n | Debt Consolidation | 28.137619 | 0 | 0 | 14 | 0 | 1843 | 2.073040 | 20 | w | 863.324396 | 18.778660 | 4.316277 | 1.020075 | 0 | INDIVIDUAL | 66 | 0 | 34 | 89801.0 | 26155 | 0 | Male |
| 3 | 3 | 6651430 | 11170 | 6954 | 17877.15585 | 59 | BAT2428731 | 16.731201 | C | C3 | MORTGAGE | 108286.57590 | Source Verified | n | Debt consolidation | 18.043730 | 1 | 0 | 7 | 0 | 13819 | 67.467951 | 12 | w | 288.173196 | 0.044131 | 0.107020 | 0.749971 | 0 | INDIVIDUAL | 39 | 0 | 40 | 9189.0 | 60214 | 0 | Female |
| 4 | 4 | 14354669 | 16890 | 13226 | 13539.92667 | 59 | BAT5341619 | NaN | C | D4 | MORTGAGE | 44234.82545 | Source Verified | n | Credit card refinancing | 17.209886 | 1 | 3 | 13 | 1 | 1544 | 85.250761 | 22 | w | 129.239553 | 19.306646 | 1294.818751 | 0.368953 | 0 | INDIVIDUAL | 18 | 0 | 430 | 126029.0 | 22579 | 0 | Male |
df = df_loans.copy()
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 67463 entries, 0 to 67462 Data columns (total 37 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 67463 non-null int64 1 ID 67463 non-null int64 2 Loan Amount 67463 non-null int64 3 Funded Amount 67463 non-null int64 4 Funded Amount Investor 67463 non-null float64 5 Term 67463 non-null int64 6 Batch Enrolled 67463 non-null object 7 Interest Rate 60717 non-null float64 8 Grade 67463 non-null object 9 Sub Grade 60717 non-null object 10 Employment Duration 67463 non-null object 11 Home Ownership 67463 non-null float64 12 Verification Status 67463 non-null object 13 Payment Plan 67463 non-null object 14 Loan Title 67463 non-null object 15 Debit to Income 67463 non-null float64 16 Delinquency - two years 67463 non-null int64 17 Inquires - six months 67463 non-null int64 18 Open Account 67463 non-null int64 19 Public Record 67463 non-null int64 20 Revolving Balance 67463 non-null int64 21 Revolving Utilities 67463 non-null float64 22 Total Accounts 67463 non-null int64 23 Initial List Status 67463 non-null object 24 Total Received Interest 67463 non-null float64 25 Total Received Late Fee 67415 non-null float64 26 Recoveries 67463 non-null float64 27 Collection Recovery Fee 67463 non-null float64 28 Collection 12 months Medical 67463 non-null int64 29 Application Type 67463 non-null object 30 Last week Pay 67463 non-null int64 31 Accounts Delinquent 67463 non-null int64 32 Total Collection Amount 67463 non-null int64 33 Total Current Balance 67279 non-null float64 34 Total Revolving Credit Limit 67463 non-null int64 35 Loan Status 67463 non-null int64 36 Gender 67463 non-null object dtypes: float64(10), int64(17), object(10) memory usage: 19.0+ MB
m = df.shape[0]
n = df.shape[1]
print("Number of rows: " + str(m))
print("Number of columns: " + str(n))
Number of rows: 67463 Number of columns: 37
check missing values
df.isna().sum()
Unnamed: 0 0 ID 0 Loan Amount 0 Funded Amount 0 Funded Amount Investor 0 Term 0 Batch Enrolled 0 Interest Rate 6746 Grade 0 Sub Grade 6746 Employment Duration 0 Home Ownership 0 Verification Status 0 Payment Plan 0 Loan Title 0 Debit to Income 0 Delinquency - two years 0 Inquires - six months 0 Open Account 0 Public Record 0 Revolving Balance 0 Revolving Utilities 0 Total Accounts 0 Initial List Status 0 Total Received Interest 0 Total Received Late Fee 48 Recoveries 0 Collection Recovery Fee 0 Collection 12 months Medical 0 Application Type 0 Last week Pay 0 Accounts Delinquent 0 Total Collection Amount 0 Total Current Balance 184 Total Revolving Credit Limit 0 Loan Status 0 Gender 0 dtype: int64
percentage of missing values in each column
df.isna().sum() * 100 / len(df)
Unnamed: 0 0.000000 ID 0.000000 Loan Amount 0.000000 Funded Amount 0.000000 Funded Amount Investor 0.000000 Term 0.000000 Batch Enrolled 0.000000 Interest Rate 9.999555 Grade 0.000000 Sub Grade 9.999555 Employment Duration 0.000000 Home Ownership 0.000000 Verification Status 0.000000 Payment Plan 0.000000 Loan Title 0.000000 Debit to Income 0.000000 Delinquency - two years 0.000000 Inquires - six months 0.000000 Open Account 0.000000 Public Record 0.000000 Revolving Balance 0.000000 Revolving Utilities 0.000000 Total Accounts 0.000000 Initial List Status 0.000000 Total Received Interest 0.000000 Total Received Late Fee 0.071150 Recoveries 0.000000 Collection Recovery Fee 0.000000 Collection 12 months Medical 0.000000 Application Type 0.000000 Last week Pay 0.000000 Accounts Delinquent 0.000000 Total Collection Amount 0.000000 Total Current Balance 0.272742 Total Revolving Credit Limit 0.000000 Loan Status 0.000000 Gender 0.000000 dtype: float64
check for missing values in each column
missing values are in columns 'Interest Rate', 'Sub Grade', 'Total Received Late Fee', and 'Total Current Balance'
# compute corr
pd.DataFrame(df.corr()['Interest Rate'].sort_values(ascending=False))
| Interest Rate | |
|---|---|
| Interest Rate | 1.000000 |
| Revolving Balance | 0.018468 |
| Total Revolving Credit Limit | 0.016073 |
| Recoveries | 0.008875 |
| Revolving Utilities | 0.008549 |
| Inquires - six months | 0.008376 |
| Home Ownership | 0.006560 |
| Public Record | 0.005294 |
| Total Accounts | 0.005032 |
| Total Received Interest | 0.004766 |
| Funded Amount | 0.004252 |
| Total Collection Amount | 0.004063 |
| Loan Status | 0.004028 |
| ID | 0.003936 |
| Delinquency - two years | 0.002790 |
| Total Received Late Fee | 0.001749 |
| Collection Recovery Fee | -0.000273 |
| Total Current Balance | -0.001167 |
| Open Account | -0.001804 |
| Funded Amount Investor | -0.003205 |
| Loan Amount | -0.005488 |
| Unnamed: 0 | -0.006536 |
| Debit to Income | -0.008922 |
| Collection 12 months Medical | -0.009449 |
| Term | -0.011455 |
| Last week Pay | -0.012697 |
| Accounts Delinquent | NaN |
df[df['Interest Rate'].notnull()].sort_values(['Interest Rate'], ascending=True)
| Unnamed: 0 | ID | Loan Amount | Funded Amount | Funded Amount Investor | Term | Batch Enrolled | Interest Rate | Grade | Sub Grade | Employment Duration | Home Ownership | Verification Status | Payment Plan | Loan Title | Debit to Income | Delinquency - two years | Inquires - six months | Open Account | Public Record | Revolving Balance | Revolving Utilities | Total Accounts | Initial List Status | Total Received Interest | Total Received Late Fee | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Application Type | Last week Pay | Accounts Delinquent | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | Gender | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 25496 | 25496 | 8276036 | 21204 | 10139 | 29014.612900 | 58 | BAT2803411 | 5.320006 | E | C3 | RENT | 65579.38120 | Source Verified | n | Major purchase | 18.957434 | 0 | 0 | 15 | 0 | 18244 | 87.547700 | 22 | w | 1184.237885 | 0.014528 | 5.211780 | 0.525790 | 0 | INDIVIDUAL | 42 | 0 | 8 | 33756.0 | 6677 | 0 | Female |
| 1599 | 1599 | 1627325 | 10007 | 23286 | 9799.112699 | 59 | BAT4694572 | 5.320159 | B | C1 | MORTGAGE | 56260.45169 | Source Verified | n | Credit card refinancing | 16.857748 | 0 | 0 | 13 | 0 | 150 | 59.350053 | 29 | f | 8563.047580 | 0.034949 | 6.110277 | 0.939361 | 0 | INDIVIDUAL | 10 | 0 | 13 | 35872.0 | 9087 | 0 | Female |
| 10362 | 10362 | 8772258 | 15020 | 9994 | 16524.998770 | 59 | BAT5629144 | 5.320547 | C | NaN | MORTGAGE | 48201.12762 | Source Verified | n | Credit card refinancing | 35.398384 | 0 | 0 | 7 | 0 | 3750 | 78.360657 | 26 | w | 295.638745 | 0.077426 | 0.561573 | 1.158956 | 0 | INDIVIDUAL | 9 | 0 | 43 | 117566.0 | 39532 | 0 | Male |
| 32497 | 32497 | 45814828 | 14439 | 12816 | 21863.526760 | 58 | BAT4694572 | 5.321256 | D | C2 | MORTGAGE | 61769.88240 | Not Verified | n | Debt consolidation | 14.185140 | 0 | 0 | 13 | 0 | 8208 | 43.715620 | 7 | f | 2623.784919 | 0.019818 | 4.511957 | 0.538073 | 0 | INDIVIDUAL | 44 | 0 | 48 | 85590.0 | 63566 | 0 | Female |
| 26222 | 26222 | 2179654 | 20652 | 32911 | 8706.438875 | 59 | BAT2833642 | 5.322213 | E | C3 | RENT | 127748.33110 | Source Verified | n | Debt consolidation | 25.815561 | 0 | 0 | 9 | 0 | 13606 | 95.565500 | 20 | w | 1845.782535 | 0.036066 | 5.121060 | 1.290670 | 0 | INDIVIDUAL | 132 | 0 | 26 | 165059.0 | 12805 | 0 | Female |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1319 | 1319 | 7380337 | 6027 | 22569 | 13189.526590 | 59 | BAT2252229 | 26.920449 | F | C4 | MORTGAGE | 193041.41050 | Source Verified | n | Credit card refinancing | 31.412988 | 0 | 0 | 13 | 0 | 33389 | 14.126358 | 24 | f | 1475.677628 | 0.066738 | 1.720790 | 0.894127 | 0 | INDIVIDUAL | 55 | 0 | 4 | 227173.0 | 1591 | 0 | Female |
| 19481 | 19481 | 4580036 | 29474 | 22813 | 21271.958040 | 59 | BAT4271519 | 26.932947 | E | A5 | RENT | 88773.23134 | Verified | n | Credit card refinancing | 27.805303 | 0 | 0 | 6 | 0 | 2064 | 85.747801 | 14 | f | 1347.748885 | 0.036672 | 1516.527895 | 1.262348 | 0 | INDIVIDUAL | 51 | 0 | 5 | 53015.0 | 17661 | 0 | Female |
| 19106 | 19106 | 35564226 | 5927 | 7019 | 26765.115570 | 59 | BAT2575549 | 27.018203 | B | C5 | MORTGAGE | 58174.45951 | Source Verified | n | Credit card refinancing | 16.826922 | 1 | 0 | 25 | 0 | 975 | 80.480994 | 18 | w | 1285.816986 | 0.054545 | 0.611236 | 0.803240 | 0 | INDIVIDUAL | 116 | 0 | 45 | 50786.0 | 24812 | 0 | Male |
| 9953 | 9953 | 37470677 | 5885 | 23598 | 11678.150580 | 59 | BAT1780517 | 27.070004 | C | A5 | MORTGAGE | 151977.95760 | Source Verified | n | Credit card refinancing | 14.974864 | 0 | 0 | 23 | 0 | 3618 | 53.209348 | 13 | w | 1853.679931 | 0.012509 | 0.796003 | 0.612786 | 0 | INDIVIDUAL | 57 | 0 | 1 | 268065.0 | 14444 | 0 | Male |
| 42929 | 42929 | 15928365 | 15325 | 21606 | 13071.423480 | 59 | BAT2575549 | 27.182348 | C | D1 | MORTGAGE | 65192.23315 | Verified | n | Credit card refinancing | 29.353605 | 1 | 1 | 7 | 1 | 4121 | 56.035305 | 21 | f | 325.903372 | 0.049459 | 1.852885 | 1.040410 | 0 | INDIVIDUAL | 85 | 0 | 15 | 110540.0 | 13994 | 0 | Male |
60717 rows × 37 columns
df[df['Interest Rate'].isnull()]
| Unnamed: 0 | ID | Loan Amount | Funded Amount | Funded Amount Investor | Term | Batch Enrolled | Interest Rate | Grade | Sub Grade | Employment Duration | Home Ownership | Verification Status | Payment Plan | Loan Title | Debit to Income | Delinquency - two years | Inquires - six months | Open Account | Public Record | Revolving Balance | Revolving Utilities | Total Accounts | Initial List Status | Total Received Interest | Total Received Late Fee | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Application Type | Last week Pay | Accounts Delinquent | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | Gender | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 2 | 1969101 | 28276 | 9311 | 21603.224550 | 59 | BAT2136391 | NaN | F | D4 | MORTGAGE | 91506.69105 | Source Verified | n | Debt Consolidation | 28.137619 | 0 | 0 | 14 | 0 | 1843 | 2.073040 | 20 | w | 863.324396 | 18.778660 | 4.316277 | 1.020075 | 0 | INDIVIDUAL | 66 | 0 | 34 | 89801.0 | 26155 | 0 | Male |
| 4 | 4 | 14354669 | 16890 | 13226 | 13539.926670 | 59 | BAT5341619 | NaN | C | D4 | MORTGAGE | 44234.82545 | Source Verified | n | Credit card refinancing | 17.209886 | 1 | 3 | 13 | 1 | 1544 | 85.250761 | 22 | w | 129.239553 | 19.306646 | 1294.818751 | 0.368953 | 0 | INDIVIDUAL | 18 | 0 | 430 | 126029.0 | 22579 | 0 | Male |
| 6 | 6 | 32737431 | 30844 | 19773 | 15777.511830 | 59 | BAT4808022 | NaN | C | C5 | RENT | 102391.82430 | Verified | n | Home improvement | 15.083911 | 0 | 0 | 11 | 0 | 14501 | 46.808804 | 37 | w | 525.738109 | 0.083528 | 3.167937 | 0.553076 | 0 | INDIVIDUAL | 71 | 0 | 3388 | 42069.0 | 31068 | 0 | Female |
| 8 | 8 | 4279662 | 9299 | 11238 | 13429.456610 | 59 | BAT5341619 | NaN | G | C2 | MORTGAGE | 63205.09072 | Verified | n | Credit card refinancing | 26.244710 | 0 | 0 | 6 | 0 | 549 | 15.947386 | 17 | w | 4140.198978 | 0.017106 | 0.530214 | 0.216985 | 0 | INDIVIDUAL | 144 | 0 | 26 | 68126.0 | 7482 | 0 | Male |
| 15 | 15 | 9813088 | 27859 | 33502 | 16545.203070 | 36 | BAT2833642 | NaN | B | A4 | OWN | 68062.59950 | Source Verified | n | Credit card refinancing | 17.715302 | 0 | 0 | 9 | 0 | 5904 | 44.388997 | 13 | f | 150.376663 | 0.039970 | 3.345747 | 0.627522 | 0 | INDIVIDUAL | 126 | 0 | 26 | 212092.0 | 15379 | 1 | Male |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 67407 | 67407 | 68589910 | 24478 | 21147 | 10163.525790 | 59 | BAT1184694 | NaN | B | F2 | MORTGAGE | 65806.06901 | Source Verified | n | Debt consolidation | 30.755705 | 0 | 0 | 33 | 0 | 542 | 59.884972 | 21 | w | 1829.602106 | 0.030968 | 6.179038 | 0.025178 | 0 | INDIVIDUAL | 22 | 0 | 31 | 57193.0 | 8540 | 0 | Male |
| 67429 | 67429 | 31889075 | 18812 | 4105 | 21636.681480 | 59 | BAT5629144 | NaN | A | NaN | MORTGAGE | 78382.64736 | Verified | n | Other | 28.520057 | 1 | 0 | 19 | 0 | 544 | 44.214822 | 24 | w | 443.804791 | 0.044423 | 2.984416 | 0.599350 | 0 | INDIVIDUAL | 13 | 0 | 20 | 53511.0 | 3786 | 0 | Female |
| 67442 | 67442 | 9052816 | 26689 | 12688 | 9604.161633 | 59 | BAT1780517 | NaN | A | E1 | OWN | 39836.03620 | Source Verified | n | Debt Consolidation | 31.900780 | 1 | 1 | 10 | 0 | 3615 | 51.020838 | 21 | w | 563.504024 | 0.000131 | 3.331061 | 0.993335 | 0 | INDIVIDUAL | 61 | 0 | 43 | 126503.0 | 12104 | 0 | Female |
| 67448 | 67448 | 7648951 | 9047 | 23342 | 21202.866970 | 59 | BAT224923 | NaN | B | A2 | MORTGAGE | 222728.30440 | Not Verified | n | Debt consolidation | 24.131695 | 2 | 0 | 11 | 0 | 3185 | 38.843071 | 36 | w | 225.079893 | 0.098502 | 3.134467 | 0.030521 | 0 | INDIVIDUAL | 76 | 0 | 44 | 91287.0 | 36208 | 0 | Male |
| 67452 | 67452 | 38027187 | 23118 | 7020 | 20949.525790 | 58 | BAT5341619 | NaN | C | C4 | MORTGAGE | 81967.15226 | Not Verified | n | Debt consolidation | 34.537577 | 1 | 0 | 12 | 0 | 27384 | 20.713108 | 11 | f | 1528.205851 | 0.071972 | 4.547243 | 0.257883 | 0 | INDIVIDUAL | 27 | 0 | 26 | 24532.0 | 15360 | 0 | Male |
6746 rows × 37 columns
There doesn't seem to be any relationship between 'Interest Rate' with other columns
df[df['Sub Grade'].notnull()].sort_values(['Sub Grade'], ascending=True)
| Unnamed: 0 | ID | Loan Amount | Funded Amount | Funded Amount Investor | Term | Batch Enrolled | Interest Rate | Grade | Sub Grade | Employment Duration | Home Ownership | Verification Status | Payment Plan | Loan Title | Debit to Income | Delinquency - two years | Inquires - six months | Open Account | Public Record | Revolving Balance | Revolving Utilities | Total Accounts | Initial List Status | Total Received Interest | Total Received Late Fee | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Application Type | Last week Pay | Accounts Delinquent | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | Gender | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 55007 | 55007 | 9259418 | 18371 | 29209 | 21338.061980 | 59 | BAT3461431 | 20.288649 | A | A1 | MORTGAGE | 50305.54329 | Verified | n | Home improvement | 21.046047 | 1 | 0 | 16 | 0 | 11182 | 66.776936 | 23 | w | 2305.836105 | 0.041001 | 3.720426 | 0.432510 | 0 | INDIVIDUAL | 58 | 0 | 30 | 212521.0 | 34932 | 0 | Female |
| 25983 | 25983 | 48523121 | 2216 | 5297 | 21600.679480 | 59 | BAT1586599 | 11.275815 | F | A1 | RENT | 115079.30270 | Verified | n | Debt consolidation | 19.040177 | 0 | 0 | 20 | 1 | 2962 | 11.049092 | 9 | w | 1459.781375 | 0.037776 | 0.116777 | 0.448141 | 0 | INDIVIDUAL | 101 | 0 | 1788 | 398165.0 | 19553 | 0 | Female |
| 38174 | 38174 | 14353470 | 20219 | 30424 | 26197.998670 | 59 | BAT4351734 | 10.915192 | B | A1 | RENT | 119294.84040 | Source Verified | n | Credit card refinancing | 33.140622 | 0 | 0 | 12 | 0 | 8100 | 82.411385 | 9 | w | 2667.506605 | 0.060860 | 5.045764 | 0.660639 | 0 | INDIVIDUAL | 76 | 0 | 365 | 40338.0 | 2351 | 0 | Male |
| 42347 | 42347 | 43166610 | 29607 | 31420 | 31654.351610 | 59 | BAT1104812 | 12.830312 | B | A1 | RENT | 91293.24774 | Verified | n | Credit card refinancing | 35.319784 | 0 | 0 | 12 | 0 | 4897 | 0.905777 | 19 | w | 420.311343 | 0.003155 | 0.382167 | 0.286176 | 0 | INDIVIDUAL | 8 | 0 | 4 | 324366.0 | 22591 | 0 | Female |
| 63267 | 63267 | 10488033 | 17788 | 16702 | 13355.533220 | 59 | BAT1780517 | 16.575896 | E | A1 | RENT | 39434.75061 | Source Verified | n | Credit card refinancing | 10.897138 | 0 | 0 | 12 | 0 | 8519 | 63.756759 | 23 | f | 2314.385365 | 0.051753 | 4.852849 | 0.819126 | 0 | INDIVIDUAL | 133 | 0 | 10 | 48486.0 | 8180 | 0 | Male |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 39090 | 39090 | 51860590 | 14676 | 6779 | 6535.745203 | 59 | BAT2803411 | 10.287672 | D | G5 | RENT | 119917.83360 | Source Verified | n | Major purchase | 31.005691 | 0 | 0 | 13 | 0 | 5580 | 24.713078 | 13 | w | 1048.771672 | 0.000326 | 1.298299 | 0.942958 | 0 | INDIVIDUAL | 152 | 0 | 18 | 145245.0 | 29320 | 0 | Male |
| 21827 | 21827 | 66326802 | 6071 | 10125 | 15801.864590 | 59 | BAT2803411 | 10.064414 | C | G5 | RENT | 91658.26206 | Source Verified | n | Credit card refinancing | 30.720587 | 0 | 0 | 14 | 0 | 6974 | 94.334309 | 18 | f | 1248.632561 | 0.015755 | 1.863639 | 0.621275 | 0 | INDIVIDUAL | 103 | 0 | 18 | 54269.0 | 9559 | 0 | Male |
| 65519 | 65519 | 30984872 | 8644 | 8498 | 14779.604390 | 59 | BAT5341619 | NaN | A | G5 | MORTGAGE | 47901.82861 | Source Verified | n | Debt consolidation | 34.382785 | 1 | 0 | 31 | 0 | 15585 | 51.307726 | 15 | w | 1870.806902 | 0.046068 | 8.151471 | 1.418528 | 0 | INDIVIDUAL | 49 | 0 | 20 | 148246.0 | 85854 | 0 | Male |
| 52736 | 52736 | 3982492 | 15839 | 7994 | 10003.788920 | 58 | BAT2136391 | NaN | B | G5 | MORTGAGE | 44199.68056 | Source Verified | n | Credit card refinancing | 16.114815 | 0 | 0 | 14 | 0 | 13142 | 54.994767 | 17 | w | 1939.235098 | 0.013555 | 8.101549 | 1.320358 | 0 | INDIVIDUAL | 57 | 0 | 19 | 1987.0 | 32583 | 0 | Female |
| 38900 | 38900 | 22167914 | 17469 | 10438 | 16754.734270 | 58 | BAT5341619 | 13.165133 | B | G5 | OWN | 62455.06299 | Not Verified | n | Credit card refinancing | 33.967106 | 0 | 0 | 18 | 0 | 10987 | 30.903691 | 16 | f | 4497.513476 | 0.010655 | 9.838323 | 0.491909 | 0 | INDIVIDUAL | 94 | 0 | 1 | 231947.0 | 9319 | 0 | Male |
60717 rows × 37 columns
df[df['Sub Grade'].isnull()]
| Unnamed: 0 | ID | Loan Amount | Funded Amount | Funded Amount Investor | Term | Batch Enrolled | Interest Rate | Grade | Sub Grade | Employment Duration | Home Ownership | Verification Status | Payment Plan | Loan Title | Debit to Income | Delinquency - two years | Inquires - six months | Open Account | Public Record | Revolving Balance | Revolving Utilities | Total Accounts | Initial List Status | Total Received Interest | Total Received Late Fee | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Application Type | Last week Pay | Accounts Delinquent | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | Gender | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 21 | 21 | 66435660 | 7466 | 9003 | 14189.439750 | 59 | BAT3873588 | 11.796877 | D | NaN | MORTGAGE | 85812.99223 | Source Verified | n | Debt consolidation | 19.506669 | 0 | 0 | 11 | 0 | 2328 | 94.327047 | 19 | f | 9948.692383 | 0.008098 | 0.188238 | 1.021255 | 0 | INDIVIDUAL | 33 | 0 | 47 | 27916.0 | 29457 | 0 | Male |
| 26 | 26 | 50701380 | 15709 | 10343 | 15668.205080 | 59 | BAT5341619 | 12.270097 | D | NaN | MORTGAGE | 172734.05460 | Source Verified | n | Debt consolidation | 11.095047 | 0 | 0 | 28 | 1 | 3052 | 45.377281 | 10 | w | 4655.823392 | 21.685850 | 4.304436 | 1.064592 | 0 | INDIVIDUAL | 103 | 0 | 16 | 56336.0 | 14619 | 0 | Female |
| 36 | 36 | 15156675 | 12765 | 6060 | 26969.038490 | 59 | BAT4136152 | 11.922914 | A | NaN | MORTGAGE | 64742.87330 | Source Verified | n | Debt consolidation | 26.560504 | 1 | 0 | 17 | 0 | 12785 | 62.879242 | 11 | w | 1030.829841 | 0.112937 | 1.126644 | 0.035603 | 0 | INDIVIDUAL | 3 | 0 | 38 | 57303.0 | 13906 | 0 | Male |
| 38 | 38 | 2161884 | 34866 | 11591 | 26043.497020 | 58 | BAT2333412 | 13.511902 | E | NaN | RENT | 294661.95100 | Source Verified | n | Debt consolidation | 13.195519 | 0 | 0 | 12 | 0 | 1384 | 51.614200 | 23 | f | 4219.232281 | 0.016011 | 4.273266 | 1.356182 | 0 | INDIVIDUAL | 32 | 0 | 36 | 345473.0 | 12965 | 0 | Female |
| 47 | 47 | 26403174 | 21447 | 9913 | 9138.181343 | 59 | BAT4694572 | 9.383149 | A | NaN | MORTGAGE | 21419.38883 | Verified | n | Debt consolidation | 14.155616 | 0 | 0 | 17 | 0 | 19586 | 75.653402 | 28 | f | 1408.691190 | 0.028742 | 2.295980 | 0.925391 | 0 | INDIVIDUAL | 50 | 0 | 20 | 94148.0 | 13700 | 0 | Male |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 67422 | 67422 | 58924146 | 14723 | 7108 | 13662.936270 | 58 | BAT4136152 | 16.215123 | F | NaN | RENT | 37906.13298 | Verified | n | Credit card refinancing | 20.111456 | 0 | 0 | 8 | 0 | 511 | 63.576874 | 28 | w | 375.836356 | 0.023708 | 6.113997 | 0.268546 | 0 | INDIVIDUAL | 142 | 0 | 8 | 125121.0 | 13875 | 0 | Female |
| 67427 | 67427 | 59234192 | 24027 | 22059 | 11084.285260 | 59 | BAT3726927 | 11.373465 | D | NaN | MORTGAGE | 84076.60589 | Not Verified | n | Debt consolidation | 21.130632 | 0 | 0 | 16 | 0 | 15977 | 83.168280 | 19 | w | 1723.949811 | 0.085734 | 1.629938 | 0.557907 | 0 | INDIVIDUAL | 4 | 0 | 49 | 21135.0 | 59007 | 1 | Male |
| 67429 | 67429 | 31889075 | 18812 | 4105 | 21636.681480 | 59 | BAT5629144 | NaN | A | NaN | MORTGAGE | 78382.64736 | Verified | n | Other | 28.520057 | 1 | 0 | 19 | 0 | 544 | 44.214822 | 24 | w | 443.804791 | 0.044423 | 2.984416 | 0.599350 | 0 | INDIVIDUAL | 13 | 0 | 20 | 53511.0 | 3786 | 0 | Female |
| 67451 | 67451 | 1746248 | 22697 | 13043 | 7275.199331 | 59 | BAT2333412 | 6.350364 | E | NaN | MORTGAGE | 53019.93218 | Source Verified | n | Credit card refinancing | 20.406832 | 0 | 0 | 13 | 0 | 29762 | 63.427205 | 14 | w | 599.066709 | 0.019909 | 1.747754 | 1.153138 | 0 | INDIVIDUAL | 102 | 0 | 9 | 292345.0 | 36613 | 0 | Male |
| 67458 | 67458 | 16164945 | 13601 | 6848 | 13175.285830 | 59 | BAT3193689 | 9.408858 | C | NaN | MORTGAGE | 83961.15003 | Verified | n | Credit card refinancing | 28.105127 | 1 | 0 | 13 | 0 | 4112 | 97.779389 | 19 | w | 1978.945960 | 0.023478 | 564.614852 | 0.865230 | 0 | INDIVIDUAL | 69 | 0 | 48 | 181775.0 | 34301 | 1 | Female |
6746 rows × 37 columns
There doesn't seem to be any relationship between 'Sub Grade' with other columns
# compute corr
pd.DataFrame(df.corr()['Total Received Late Fee'].sort_values(ascending=False))
| Total Received Late Fee | |
|---|---|
| Total Received Late Fee | 1.000000 |
| Public Record | 0.016570 |
| Total Revolving Credit Limit | 0.014836 |
| Loan Status | 0.009329 |
| Inquires - six months | 0.008291 |
| Recoveries | 0.008024 |
| Delinquency - two years | 0.007930 |
| Total Collection Amount | 0.007433 |
| ID | 0.006711 |
| Revolving Balance | 0.004953 |
| Total Accounts | 0.004928 |
| Collection Recovery Fee | 0.004875 |
| Home Ownership | 0.004006 |
| Collection 12 months Medical | 0.003568 |
| Total Received Interest | 0.002526 |
| Last week Pay | 0.001781 |
| Interest Rate | 0.001749 |
| Unnamed: 0 | 0.001609 |
| Funded Amount | 0.001532 |
| Open Account | 0.000835 |
| Loan Amount | -0.000018 |
| Funded Amount Investor | -0.000257 |
| Total Current Balance | -0.000679 |
| Term | -0.000713 |
| Revolving Utilities | -0.001344 |
| Debit to Income | -0.010239 |
| Accounts Delinquent | NaN |
df[df['Total Received Late Fee'].notnull()].sort_values(['Total Received Late Fee'], ascending=True)
| Unnamed: 0 | ID | Loan Amount | Funded Amount | Funded Amount Investor | Term | Batch Enrolled | Interest Rate | Grade | Sub Grade | Employment Duration | Home Ownership | Verification Status | Payment Plan | Loan Title | Debit to Income | Delinquency - two years | Inquires - six months | Open Account | Public Record | Revolving Balance | Revolving Utilities | Total Accounts | Initial List Status | Total Received Interest | Total Received Late Fee | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Application Type | Last week Pay | Accounts Delinquent | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | Gender | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 18552 | 18552 | 48709136 | 21024 | 13504 | 21165.095680 | 58 | BAT224923 | 10.102512 | A | C2 | RENT | 90277.47377 | Source Verified | n | Credit card refinancing | 20.068894 | 0 | 0 | 12 | 0 | 2533 | 84.422760 | 11 | w | 2197.396700 | 0.000003 | 0.309523 | 0.619148 | 0 | INDIVIDUAL | 155 | 0 | 50 | 111477.0 | 1793 | 0 | Female |
| 2088 | 2088 | 1374566 | 15974 | 31602 | 14231.486460 | 59 | BAT2558388 | 8.125567 | A | A2 | MORTGAGE | 94038.02301 | Source Verified | n | Debt consolidation | 24.936725 | 0 | 0 | 17 | 0 | 3191 | 5.221059 | 4 | w | 518.260586 | 0.000004 | 7.607152 | 1.032296 | 0 | INDIVIDUAL | 74 | 0 | 4 | 109704.0 | 24126 | 0 | Female |
| 4963 | 4963 | 4407076 | 12929 | 11274 | 13568.058170 | 59 | BAT5811547 | 14.832581 | B | B2 | RENT | 91816.43840 | Source Verified | n | Debt consolidation | 39.058121 | 0 | 0 | 17 | 0 | 8148 | 43.593321 | 23 | w | 5486.758920 | 0.000006 | 0.553447 | 1.332853 | 0 | INDIVIDUAL | 86 | 0 | 22 | 115007.0 | 1703 | 0 | Female |
| 4001 | 4001 | 10590180 | 4517 | 18639 | 13236.023360 | 59 | BAT5629144 | 19.314768 | C | NaN | RENT | 68380.78600 | Source Verified | n | Debt consolidation | 35.797674 | 0 | 0 | 14 | 0 | 694 | 51.274415 | 36 | w | 290.204697 | 0.000013 | 5.881491 | 1.517642 | 0 | INDIVIDUAL | 25 | 0 | 43 | 119763.0 | 5015 | 0 | Male |
| 37452 | 37452 | 63140557 | 6927 | 6178 | 20502.775650 | 59 | BAT2428731 | 16.857627 | C | A5 | MORTGAGE | 179189.79830 | Not Verified | n | Credit card refinancing | 32.164313 | 0 | 0 | 18 | 0 | 3114 | 11.909360 | 17 | w | 8710.368422 | 0.000018 | 4.713503 | 1.591567 | 0 | INDIVIDUAL | 126 | 0 | 6 | 43501.0 | 4684 | 0 | Male |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 44488 | 44488 | 56617030 | 20724 | 32602 | 6587.012806 | 59 | BAT1766061 | 14.851414 | C | C5 | RENT | 84151.56226 | Source Verified | n | Debt consolidation | 26.970416 | 3 | 0 | 15 | 0 | 6791 | 95.247747 | 22 | f | 2438.155815 | 42.416569 | 1.350599 | 0.964960 | 0 | INDIVIDUAL | 10 | 0 | 5 | 38158.0 | 32226 | 1 | Male |
| 54804 | 54804 | 2153218 | 27478 | 30817 | 24684.579950 | 58 | BAT2428731 | 9.755270 | C | E2 | MORTGAGE | 179223.33310 | Verified | n | Debt consolidation | 28.784696 | 0 | 0 | 11 | 0 | 22376 | 19.805570 | 16 | w | 466.828086 | 42.449040 | 9.021348 | 0.766373 | 0 | INDIVIDUAL | 75 | 0 | 29 | 79869.0 | 4204 | 0 | Female |
| 5472 | 5472 | 25880836 | 15145 | 2989 | 30686.648540 | 58 | BAT1766061 | 10.225227 | G | A1 | RENT | 102280.44610 | Not Verified | n | Consolidate | 22.588091 | 0 | 0 | 32 | 0 | 10662 | 41.957244 | 19 | w | 296.909368 | 42.588063 | 6.423320 | 0.986690 | 0 | INDIVIDUAL | 65 | 0 | 43 | 164862.0 | 29764 | 0 | Male |
| 6157 | 6157 | 5516629 | 5738 | 7437 | 19312.842690 | 58 | BAT3873588 | 9.343276 | F | B4 | MORTGAGE | 84037.06704 | Verified | n | Credit card refinancing | 17.214565 | 0 | 0 | 4 | 0 | 3065 | 81.633519 | 8 | w | 5521.381673 | 42.595127 | 2.834230 | 0.767260 | 0 | INDIVIDUAL | 36 | 0 | 50 | 53261.0 | 35754 | 0 | Male |
| 12798 | 12798 | 5365312 | 1927 | 9274 | 11790.080510 | 59 | BAT2333412 | 17.156415 | A | E1 | MORTGAGE | 129627.99540 | Not Verified | n | Credit card refinancing | 35.356079 | 0 | 0 | 8 | 0 | 4798 | 56.608024 | 15 | f | 10816.265440 | 42.618882 | 3.974646 | 0.547648 | 0 | INDIVIDUAL | 114 | 0 | 43 | 88581.0 | 85166 | 0 | Male |
67415 rows × 37 columns
df[df['Total Received Late Fee'].isnull()].head(10)
| Unnamed: 0 | ID | Loan Amount | Funded Amount | Funded Amount Investor | Term | Batch Enrolled | Interest Rate | Grade | Sub Grade | Employment Duration | Home Ownership | Verification Status | Payment Plan | Loan Title | Debit to Income | Delinquency - two years | Inquires - six months | Open Account | Public Record | Revolving Balance | Revolving Utilities | Total Accounts | Initial List Status | Total Received Interest | Total Received Late Fee | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Application Type | Last week Pay | Accounts Delinquent | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | Gender | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 855 | 855 | 67194717 | 7745 | 26897 | 7989.569665 | 59 | BAT1586599 | 12.908974 | E | B5 | MORTGAGE | 38040.80821 | Verified | n | Credit card refinancing | 20.637452 | 1 | 0 | 20 | 0 | 4382 | 51.555655 | 27 | w | 539.086120 | NaN | 4.921517 | 1.633589 | 0 | INDIVIDUAL | 155 | 0 | 9 | 168579.0 | 23821 | 0 | Male |
| 867 | 867 | 9869638 | 19408 | 19734 | 7024.693287 | 58 | BAT1586599 | 9.976743 | C | C3 | RENT | 155303.50110 | Not Verified | n | Credit card refinancing | 19.325631 | 0 | 1 | 16 | 0 | 41729 | 83.248951 | 29 | f | 648.956678 | NaN | 0.841670 | 0.921401 | 0 | INDIVIDUAL | 124 | 0 | 38 | 213773.0 | 48080 | 0 | Male |
| 995 | 995 | 39879780 | 20001 | 8498 | 13702.700440 | 58 | BAT5714674 | 6.826658 | B | B5 | OWN | 84449.38446 | Not Verified | n | Credit card refinancing | 17.832647 | 0 | 0 | 10 | 0 | 2908 | 56.784965 | 26 | f | 579.053645 | NaN | 5.443629 | 0.725781 | 0 | INDIVIDUAL | 75 | 0 | 42 | 177567.0 | 26716 | 0 | Male |
| 4366 | 4366 | 21892131 | 18012 | 12829 | 13014.815720 | 58 | BAT1586599 | 9.465654 | C | C4 | RENT | 39786.52228 | Not Verified | n | Credit card refinancing | 36.047951 | 0 | 0 | 10 | 0 | 2592 | 57.432326 | 17 | f | 2269.594412 | NaN | 4.166866 | 1.211923 | 0 | INDIVIDUAL | 97 | 0 | 26 | 20664.0 | 1614 | 0 | Male |
| 4529 | 4529 | 38421571 | 30498 | 21915 | 6184.010135 | 59 | BAT3873588 | NaN | E | F3 | RENT | 64199.44250 | Source Verified | n | Credit card refinancing | 14.998502 | 0 | 0 | 20 | 0 | 6297 | 60.342855 | 10 | w | 688.332049 | NaN | 1723.519525 | 0.345834 | 0 | JOINT | 69 | 0 | 28 | 121334.0 | 8009 | 0 | Joint |
| 6059 | 6059 | 1618834 | 23734 | 16322 | 19565.185080 | 59 | BAT1586599 | 16.307378 | D | C2 | RENT | 74446.31337 | Source Verified | n | Debt consolidation | 10.603572 | 0 | 0 | 12 | 0 | 2264 | 82.977771 | 17 | f | 4022.964866 | NaN | 2.488386 | 1.451959 | 0 | INDIVIDUAL | 75 | 0 | 36 | 325281.0 | 91769 | 0 | Male |
| 6063 | 6063 | 36366878 | 14712 | 21087 | 13108.222740 | 58 | BAT4136152 | 8.132547 | A | B1 | MORTGAGE | 121660.34460 | Source Verified | n | Debt consolidation | 33.957014 | 0 | 0 | 12 | 0 | 21512 | 49.278090 | 9 | w | 777.289360 | NaN | 6.051790 | 1.100933 | 0 | INDIVIDUAL | 66 | 0 | 58 | 28343.0 | 2678 | 0 | Female |
| 6196 | 6196 | 41295369 | 30490 | 9567 | 14310.072270 | 59 | BAT1780517 | 9.714841 | E | B3 | RENT | 43147.00813 | Source Verified | n | Credit card refinancing | 16.524584 | 0 | 0 | 16 | 0 | 5739 | 56.960684 | 18 | w | 2899.035050 | NaN | 1.744498 | 0.716368 | 0 | INDIVIDUAL | 2 | 0 | 50 | 60227.0 | 11943 | 0 | Male |
| 8174 | 8174 | 4635239 | 8083 | 11568 | 15561.294130 | 59 | BAT224923 | 14.663171 | D | C2 | MORTGAGE | 65621.91528 | Not Verified | n | consolidate | 36.265624 | 0 | 0 | 13 | 0 | 5252 | 88.742896 | 19 | f | 4296.743321 | NaN | 2.333218 | 1.103879 | 0 | INDIVIDUAL | 82 | 0 | 20 | 244794.0 | 17181 | 0 | Male |
| 8633 | 8633 | 48723253 | 5459 | 29825 | 15434.773680 | 59 | BAT5811547 | 12.083560 | C | D5 | RENT | 56670.86036 | Source Verified | n | Credit card refinancing | 14.608905 | 0 | 0 | 10 | 0 | 8284 | 54.554224 | 17 | f | 7845.724645 | NaN | 3.876130 | 33.634275 | 0 | INDIVIDUAL | 66 | 0 | 22 | 349956.0 | 20533 | 0 | Male |
There doesn't seem to be any relationship between 'Total Received Late Fee' with other columns
# compute corr
pd.DataFrame(df.corr()['Total Current Balance'].sort_values(ascending=False))
| Total Current Balance | |
|---|---|
| Total Current Balance | 1.000000 |
| Loan Status | 0.010033 |
| Home Ownership | 0.007042 |
| Total Collection Amount | 0.005793 |
| Total Revolving Credit Limit | 0.004708 |
| Unnamed: 0 | 0.004135 |
| Public Record | 0.003708 |
| Funded Amount Investor | 0.003444 |
| Term | 0.003268 |
| Collection Recovery Fee | 0.002791 |
| Delinquency - two years | 0.002426 |
| Inquires - six months | 0.001699 |
| Collection 12 months Medical | 0.001627 |
| Total Received Interest | 0.001500 |
| Last week Pay | 0.000986 |
| Recoveries | -0.000581 |
| Total Received Late Fee | -0.000679 |
| Interest Rate | -0.001167 |
| Funded Amount | -0.001604 |
| ID | -0.003650 |
| Total Accounts | -0.007056 |
| Open Account | -0.007245 |
| Revolving Balance | -0.007499 |
| Loan Amount | -0.008423 |
| Debit to Income | -0.011620 |
| Revolving Utilities | -0.019634 |
| Accounts Delinquent | NaN |
df[df['Total Current Balance'].notnull()].sort_values(['Total Current Balance'], ascending=True)
| Unnamed: 0 | ID | Loan Amount | Funded Amount | Funded Amount Investor | Term | Batch Enrolled | Interest Rate | Grade | Sub Grade | Employment Duration | Home Ownership | Verification Status | Payment Plan | Loan Title | Debit to Income | Delinquency - two years | Inquires - six months | Open Account | Public Record | Revolving Balance | Revolving Utilities | Total Accounts | Initial List Status | Total Received Interest | Total Received Late Fee | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Application Type | Last week Pay | Accounts Delinquent | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | Gender | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 26696 | 26696 | 2524308 | 14811 | 15355 | 12080.901240 | 59 | BAT2558388 | 8.703669 | E | D1 | MORTGAGE | 56319.60896 | Verified | n | Credit card refinancing | 18.373409 | 0 | 0 | 13 | 0 | 1758 | 46.060281 | 21 | f | 219.457740 | 0.029738 | 5.263024 | 0.200221 | 0 | INDIVIDUAL | 148 | 0 | 11 | 617.0 | 17108 | 0 | Male |
| 20852 | 20852 | 37647411 | 8357 | 6051 | 13206.000450 | 58 | BAT2575549 | NaN | A | E5 | OWN | 82495.04274 | Source Verified | n | Debt consolidation | 28.572824 | 0 | 0 | 29 | 0 | 12580 | 69.575682 | 21 | w | 3158.025145 | 0.066509 | 4.671611 | 0.920893 | 0 | INDIVIDUAL | 115 | 0 | 55 | 623.0 | 10616 | 0 | Male |
| 38355 | 38355 | 1861838 | 4495 | 8194 | 23419.197220 | 59 | BAT2078974 | 12.056133 | B | B4 | MORTGAGE | 122886.34630 | Verified | n | Credit card refinancing | 21.743055 | 0 | 0 | 10 | 1 | 2156 | 82.261624 | 20 | w | 2134.396144 | 0.010078 | 3.145736 | 0.671325 | 1 | INDIVIDUAL | 27 | 0 | 1112 | 628.0 | 20850 | 0 | Male |
| 20516 | 20516 | 48049675 | 30675 | 10266 | 24134.503290 | 59 | BAT2522922 | 19.052820 | B | E4 | MORTGAGE | 39903.30219 | Source Verified | n | Major purchase | 36.946161 | 2 | 0 | 18 | 0 | 18783 | 21.720593 | 19 | w | 254.991001 | 0.030207 | 3.990098 | 0.221014 | 1 | INDIVIDUAL | 6 | 0 | 38 | 630.0 | 1403 | 0 | Female |
| 35030 | 35030 | 58667804 | 15741 | 9350 | 17142.053970 | 58 | BAT4136152 | 12.323640 | C | A1 | MORTGAGE | 34845.29722 | Not Verified | n | Credit card refinancing | 23.246874 | 0 | 0 | 9 | 0 | 7636 | 76.299122 | 5 | f | 13333.936360 | 0.056526 | 0.106597 | 0.154049 | 1 | INDIVIDUAL | 34 | 0 | 36 | 667.0 | 2657 | 0 | Male |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 66192 | 66192 | 25700771 | 3952 | 8126 | 10326.213480 | 59 | BAT1930365 | 8.008733 | D | D5 | OWN | 20940.19889 | Not Verified | n | Debt consolidation | 11.595069 | 0 | 0 | 9 | 0 | 2821 | 59.581288 | 27 | f | 1037.544051 | 0.031157 | 2.902742 | 1.051021 | 0 | INDIVIDUAL | 7 | 0 | 44 | 1145991.0 | 107223 | 0 | Male |
| 29507 | 29507 | 11192646 | 2487 | 23165 | 10102.405920 | 59 | BAT4271519 | 10.458313 | F | C4 | OWN | 60919.80416 | Source Verified | n | Debt consolidation | 18.372909 | 0 | 0 | 13 | 0 | 14624 | 64.896879 | 20 | f | 415.010255 | 0.090065 | 0.042053 | 0.781663 | 0 | INDIVIDUAL | 103 | 0 | 41 | 1150619.0 | 129353 | 0 | Male |
| 36387 | 36387 | 37731061 | 5979 | 12485 | 25191.067740 | 36 | BAT3873588 | 14.886599 | C | C1 | MORTGAGE | 46900.81535 | Not Verified | n | Credit card refinancing | 14.886248 | 0 | 1 | 19 | 0 | 7751 | 55.094399 | 30 | f | 621.089548 | 0.047068 | 0.081642 | 1.211684 | 0 | INDIVIDUAL | 80 | 0 | 40 | 1157944.0 | 24486 | 0 | Male |
| 50422 | 50422 | 65349973 | 14373 | 16644 | 22016.019770 | 59 | BAT2575549 | 10.897962 | B | B2 | RENT | 80913.96019 | Source Verified | n | Debt Consolidation | 21.338359 | 0 | 0 | 20 | 0 | 10353 | 22.846902 | 7 | f | 2258.905629 | 0.010657 | 1.166836 | 1.081823 | 0 | INDIVIDUAL | 86 | 0 | 8 | 1165601.0 | 17245 | 0 | Male |
| 66980 | 66980 | 11497942 | 22578 | 32674 | 8063.210334 | 58 | BAT4722912 | NaN | A | A5 | MORTGAGE | 83586.20714 | Verified | n | Credit card refinancing | 16.410404 | 2 | 0 | 12 | 1 | 6184 | 61.442051 | 24 | w | 2208.076252 | 0.076446 | 6.709537 | 1.223081 | 0 | INDIVIDUAL | 68 | 0 | 21 | 1177412.0 | 94510 | 0 | Male |
67279 rows × 37 columns
df[df['Total Current Balance'].isnull()].head(10)
| Unnamed: 0 | ID | Loan Amount | Funded Amount | Funded Amount Investor | Term | Batch Enrolled | Interest Rate | Grade | Sub Grade | Employment Duration | Home Ownership | Verification Status | Payment Plan | Loan Title | Debit to Income | Delinquency - two years | Inquires - six months | Open Account | Public Record | Revolving Balance | Revolving Utilities | Total Accounts | Initial List Status | Total Received Interest | Total Received Late Fee | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Application Type | Last week Pay | Accounts Delinquent | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | Gender | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 313 | 313 | 7882573 | 31100 | 22692 | 6865.938722 | 59 | BAT4271519 | 8.860506 | A | NaN | OWN | 130824.32210 | Not Verified | n | Credit card refinancing | 36.029398 | 0 | 3 | 7 | 1 | 6016 | 2.135089 | 23 | w | 823.913429 | 0.084263 | 7.613259 | 0.707554 | 0 | INDIVIDUAL | 89 | 0 | 57 | NaN | 22325 | 0 | Male |
| 519 | 519 | 2738462 | 6645 | 11538 | 9797.557413 | 59 | BAT4271519 | 10.109718 | B | D2 | RENT | 85139.16312 | Not Verified | n | Credit card refinancing | 26.865370 | 0 | 0 | 9 | 0 | 3093 | 60.333171 | 26 | f | 1048.378393 | 0.004047 | 5.116557 | 1.121375 | 0 | INDIVIDUAL | 14 | 0 | 33 | NaN | 18403 | 0 | Male |
| 559 | 559 | 2415656 | 4894 | 13293 | 15375.457420 | 59 | BAT5525466 | 9.996528 | E | C4 | MORTGAGE | 48788.81351 | Verified | n | Debt Consolidation | 8.194081 | 0 | 0 | 23 | 0 | 1465 | 63.511366 | 19 | f | 3619.392433 | 0.005625 | 6.325587 | 1.371396 | 0 | INDIVIDUAL | 29 | 0 | 20 | NaN | 10379 | 0 | Male |
| 1457 | 1457 | 30607705 | 15178 | 26760 | 12747.572870 | 59 | BAT5525466 | 15.610224 | C | D2 | RENT | 35053.88525 | Verified | n | Credit card refinancing | 26.228772 | 0 | 0 | 9 | 0 | 10101 | 8.213129 | 21 | f | 240.764585 | 0.024132 | 1.921335 | 1.108562 | 0 | INDIVIDUAL | 29 | 0 | 1345 | NaN | 3942 | 0 | Male |
| 1583 | 1583 | 7933530 | 16445 | 21663 | 10198.282080 | 59 | BAT1104812 | 6.415442 | B | B5 | MORTGAGE | 129872.74320 | Verified | n | Debt consolidation | 4.713766 | 0 | 0 | 17 | 0 | 5467 | 49.632281 | 21 | w | 804.052634 | 0.102653 | 7.077454 | 0.845506 | 0 | INDIVIDUAL | 79 | 0 | 25 | NaN | 65443 | 0 | Female |
| 1592 | 1592 | 5360704 | 16815 | 9115 | 15054.620840 | 58 | BAT4351734 | 6.133730 | B | C4 | MORTGAGE | 82882.67896 | Verified | n | Credit card refinancing | 15.616988 | 0 | 0 | 12 | 0 | 15563 | 54.395493 | 13 | w | 429.716046 | 0.129120 | 4.595762 | 1.195341 | 0 | INDIVIDUAL | 97 | 0 | 30 | NaN | 46656 | 0 | Female |
| 1877 | 1877 | 18969894 | 32956 | 14472 | 3577.954796 | 58 | BAT5489674 | 11.717248 | B | D1 | MORTGAGE | 69600.56933 | Source Verified | n | Debt consolidation | 27.629691 | 1 | 0 | 12 | 0 | 10493 | 55.052380 | 28 | w | 293.212777 | 0.048511 | 1.646009 | 1.007643 | 0 | INDIVIDUAL | 24 | 0 | 54 | NaN | 4041 | 0 | Female |
| 2581 | 2581 | 62412375 | 15200 | 26681 | 31796.829240 | 58 | BAT1780517 | 6.401356 | C | NaN | RENT | 41509.04880 | Source Verified | n | Credit card refinancing | 38.053294 | 1 | 0 | 13 | 0 | 9283 | 11.120542 | 14 | w | 658.350448 | 0.036124 | 2.064368 | 0.747271 | 0 | INDIVIDUAL | 65 | 0 | 1789 | NaN | 4697 | 1 | Male |
| 2583 | 2583 | 33528486 | 27138 | 19041 | 28861.025560 | 59 | BAT1930365 | 15.028807 | D | NaN | MORTGAGE | 128938.11380 | Source Verified | n | Consolidation Loan | 5.027308 | 0 | 0 | 15 | 0 | 6092 | 41.771873 | 14 | w | 1606.748703 | 0.013861 | 8.678343 | 0.725862 | 0 | INDIVIDUAL | 125 | 0 | 51 | NaN | 9910 | 0 | Male |
| 2610 | 2610 | 6427689 | 31314 | 32947 | 11777.305120 | 58 | BAT2428731 | 12.683643 | B | B1 | RENT | 67388.70913 | Verified | n | Major purchase | 33.901671 | 0 | 0 | 9 | 0 | 15062 | 32.081908 | 9 | w | 676.880501 | 0.063161 | 6.836078 | 1.110230 | 0 | INDIVIDUAL | 9 | 0 | 33 | NaN | 15941 | 1 | Male |
There doesn't seem to be any relationship between 'Total Current Balance' with other columns
percentage of rows with missing values
df.isnull().any(axis=1).sum() * 100 /len(df)
19.268339682492627
drop columns?
Based on the data on the percentage of missing values in the column 'Interest Rate' and 'Sub Grade' the largest is around 10%. Meanwhile, the 'Total Received Late Fee' and 'Total Current Balance' columns each have a very small percentage of missing values, 0.07% and 0.27%. Therefore, column removal is not needed because the percentage of missing values is quite small (< 30%).
drop rows?
Total percentage of rows with missing values is 19.27%. Because the total missing values for all rows is more than 5%, to avoid wasting a lot of data, the missing values will be replaced with other values.
replace missing values
Type of missing data above is MCAR or Missing Completely At Random where there is no clear reasoning as to why a certain value in dataset is missing. Here we re gonna use multivariate imputation method. Multivariate imputation algorithm use the entire set of available feature dimensions to estimate the missing values. Basically, the purpose of multivariate imputation is to use other features in the dataset to predict the missing values in the current feature.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
ndf = df[['Interest Rate','Total Received Late Fee','Total Current Balance']]
imp = IterativeImputer()
imp.fit(ndf)
df_imput = pd.DataFrame(imp.transform(ndf), columns = ndf.columns)
df_imput
| Interest Rate | Total Received Late Fee | Total Current Balance | |
|---|---|---|---|
| 0 | 11.135007 | 0.102055 | 311301.0 |
| 1 | 12.237563 | 0.036181 | 182610.0 |
| 2 | 11.850712 | 18.778660 | 89801.0 |
| 3 | 16.731201 | 0.044131 | 9189.0 |
| 4 | 11.849692 | 19.306646 | 126029.0 |
| ... | ... | ... | ... |
| 67458 | 9.408858 | 0.023478 | 181775.0 |
| 67459 | 9.972104 | 0.027095 | 22692.0 |
| 67460 | 19.650943 | 0.028212 | 176857.0 |
| 67461 | 13.169095 | 0.074508 | 361339.0 |
| 67462 | 16.034631 | 0.000671 | 196960.0 |
67463 rows × 3 columns
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
ndf_cat = df[['Sub Grade']]
srs_cat = ['Sub Grade']
ndf_cat[srs_cat] = ndf_cat[srs_cat].apply(lambda series: pd.Series(
LabelEncoder().fit_transform(series[series.notnull()]),
index=series[series.notnull()].index
))
imp_cat = IterativeImputer(estimator=RandomForestClassifier(),
initial_strategy='most_frequent',
random_state=0)
ndf_cat[srs_cat] = imp_cat.fit_transform(ndf_cat[srs_cat])
df_imput_cat = pd.DataFrame(ndf_cat)
df_imput_cat
| Sub Grade | |
|---|---|
| 0 | 13.0 |
| 1 | 17.0 |
| 2 | 18.0 |
| 3 | 12.0 |
| 4 | 18.0 |
| ... | ... |
| 67458 | 8.0 |
| 67459 | 7.0 |
| 67460 | 27.0 |
| 67461 | 22.0 |
| 67462 | 15.0 |
67463 rows × 1 columns
le = LabelEncoder()
le.fit(df['Sub Grade'].values.tolist())
list(le.classes_)
le.transform(df['Sub Grade'].values.tolist())
df_imput_cat_inv = list(le.inverse_transform(df_imput_cat.values.astype(int).tolist()))
df_imput_cat_inv = pd.DataFrame(df_imput_cat_inv, columns=['Sub Grade'])
df_imput_cat_inv
| Sub Grade | |
|---|---|
| 0 | C4 |
| 1 | D3 |
| 2 | D4 |
| 3 | C3 |
| 4 | D4 |
| ... | ... |
| 67458 | B4 |
| 67459 | B3 |
| 67460 | F3 |
| 67461 | E3 |
| 67462 | D1 |
67463 rows × 1 columns
df['Sub Grade'] = df_imput_cat_inv['Sub Grade']
df[['Interest Rate','Total Received Late Fee','Total Current Balance']] = df_imput[['Interest Rate','Total Received Late Fee','Total Current Balance']]
df.head()
| Unnamed: 0 | ID | Loan Amount | Funded Amount | Funded Amount Investor | Term | Batch Enrolled | Interest Rate | Grade | Sub Grade | Employment Duration | Home Ownership | Verification Status | Payment Plan | Loan Title | Debit to Income | Delinquency - two years | Inquires - six months | Open Account | Public Record | Revolving Balance | Revolving Utilities | Total Accounts | Initial List Status | Total Received Interest | Total Received Late Fee | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Application Type | Last week Pay | Accounts Delinquent | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | Gender | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 65087372 | 10000 | 32236 | 12329.36286 | 59 | BAT2522922 | 11.135007 | B | C4 | MORTGAGE | 176346.62670 | Not Verified | n | Debt Consolidation | 16.284758 | 1 | 0 | 13 | 0 | 24246 | 74.932551 | 7 | w | 2929.646315 | 0.102055 | 2.498291 | 0.793724 | 0 | INDIVIDUAL | 49 | 0 | 31 | 311301.0 | 6619 | 0 | Male |
| 1 | 1 | 1450153 | 3609 | 11940 | 12191.99692 | 59 | BAT1586599 | 12.237563 | C | D3 | RENT | 39833.92100 | Source Verified | n | Debt consolidation | 15.412409 | 0 | 0 | 12 | 0 | 812 | 78.297186 | 13 | f | 772.769385 | 0.036181 | 2.377215 | 0.974821 | 0 | INDIVIDUAL | 109 | 0 | 53 | 182610.0 | 20885 | 0 | Male |
| 2 | 2 | 1969101 | 28276 | 9311 | 21603.22455 | 59 | BAT2136391 | 11.850712 | F | D4 | MORTGAGE | 91506.69105 | Source Verified | n | Debt Consolidation | 28.137619 | 0 | 0 | 14 | 0 | 1843 | 2.073040 | 20 | w | 863.324396 | 18.778660 | 4.316277 | 1.020075 | 0 | INDIVIDUAL | 66 | 0 | 34 | 89801.0 | 26155 | 0 | Male |
| 3 | 3 | 6651430 | 11170 | 6954 | 17877.15585 | 59 | BAT2428731 | 16.731201 | C | C3 | MORTGAGE | 108286.57590 | Source Verified | n | Debt consolidation | 18.043730 | 1 | 0 | 7 | 0 | 13819 | 67.467951 | 12 | w | 288.173196 | 0.044131 | 0.107020 | 0.749971 | 0 | INDIVIDUAL | 39 | 0 | 40 | 9189.0 | 60214 | 0 | Female |
| 4 | 4 | 14354669 | 16890 | 13226 | 13539.92667 | 59 | BAT5341619 | 11.849692 | C | D4 | MORTGAGE | 44234.82545 | Source Verified | n | Credit card refinancing | 17.209886 | 1 | 3 | 13 | 1 | 1544 | 85.250761 | 22 | w | 129.239553 | 19.306646 | 1294.818751 | 0.368953 | 0 | INDIVIDUAL | 18 | 0 | 430 | 126029.0 | 22579 | 0 | Male |
m = df.shape[0]
n = df.shape[1]
print("Number of rows: " + str(m))
print("Number of columns: " + str(n))
Number of rows: 67463 Number of columns: 37
re-check the number of missing values
df.isna().sum()
Unnamed: 0 0 ID 0 Loan Amount 0 Funded Amount 0 Funded Amount Investor 0 Term 0 Batch Enrolled 0 Interest Rate 0 Grade 0 Sub Grade 0 Employment Duration 0 Home Ownership 0 Verification Status 0 Payment Plan 0 Loan Title 0 Debit to Income 0 Delinquency - two years 0 Inquires - six months 0 Open Account 0 Public Record 0 Revolving Balance 0 Revolving Utilities 0 Total Accounts 0 Initial List Status 0 Total Received Interest 0 Total Received Late Fee 0 Recoveries 0 Collection Recovery Fee 0 Collection 12 months Medical 0 Application Type 0 Last week Pay 0 Accounts Delinquent 0 Total Collection Amount 0 Total Current Balance 0 Total Revolving Credit Limit 0 Loan Status 0 Gender 0 dtype: int64
Missing values already been handled, no longer exist in the data.
print("Number of duplicated data:", df.duplicated().sum())
Number of duplicated data: 0
print("Number of duplicated ID:", df['ID'].duplicated().sum())
Number of duplicated ID: 0
No duplicate values or rows in data
df.dtypes
Unnamed: 0 int64 ID int64 Loan Amount int64 Funded Amount int64 Funded Amount Investor float64 Term int64 Batch Enrolled object Interest Rate float64 Grade object Sub Grade object Employment Duration object Home Ownership float64 Verification Status object Payment Plan object Loan Title object Debit to Income float64 Delinquency - two years int64 Inquires - six months int64 Open Account int64 Public Record int64 Revolving Balance int64 Revolving Utilities float64 Total Accounts int64 Initial List Status object Total Received Interest float64 Total Received Late Fee float64 Recoveries float64 Collection Recovery Fee float64 Collection 12 months Medical int64 Application Type object Last week Pay int64 Accounts Delinquent int64 Total Collection Amount int64 Total Current Balance float64 Total Revolving Credit Limit int64 Loan Status int64 Gender object dtype: object
df.tail(3)
| Unnamed: 0 | ID | Loan Amount | Funded Amount | Funded Amount Investor | Term | Batch Enrolled | Interest Rate | Grade | Sub Grade | Employment Duration | Home Ownership | Verification Status | Payment Plan | Loan Title | Debit to Income | Delinquency - two years | Inquires - six months | Open Account | Public Record | Revolving Balance | Revolving Utilities | Total Accounts | Initial List Status | Total Received Interest | Total Received Late Fee | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Application Type | Last week Pay | Accounts Delinquent | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | Gender | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 67460 | 67460 | 16435904 | 15897 | 32921 | 12329.45775 | 59 | BAT1761981 | 19.650943 | A | F3 | MORTGAGE | 34813.96985 | Verified | n | Lending loan | 10.295774 | 0 | 0 | 7 | 1 | 2195 | 1.500090 | 9 | w | 2691.995532 | 0.028212 | 5.673092 | 1.607093 | 0 | INDIVIDUAL | 137 | 0 | 17 | 176857.0 | 42330 | 0 | Female |
| 67461 | 67461 | 5300325 | 16567 | 4975 | 21353.68465 | 59 | BAT2333412 | 13.169095 | D | E3 | OWN | 96938.83564 | Not Verified | n | Debt consolidation | 7.614624 | 0 | 0 | 14 | 0 | 1172 | 68.481882 | 15 | f | 3659.334202 | 0.074508 | 1.157454 | 0.207608 | 0 | INDIVIDUAL | 73 | 0 | 61 | 361339.0 | 39075 | 0 | Male |
| 67462 | 67462 | 65443173 | 15353 | 29875 | 14207.44860 | 59 | BAT1930365 | 16.034631 | B | D1 | MORTGAGE | 105123.15580 | Verified | n | Debt consolidation | 16.052112 | 0 | 0 | 30 | 0 | 8762 | 81.692328 | 16 | f | 1324.255922 | 0.000671 | 1.856480 | 0.366386 | 0 | INDIVIDUAL | 54 | 0 | 47 | 196960.0 | 66060 | 0 | Male |
after checking data types and table above, all match between data type and value of each column
for i in df.select_dtypes(include=['number']):
unique_data_num = df[i].nunique()
print('total unique values column',i,':', unique_data_num)
total unique values column Unnamed: 0 : 67463 total unique values column ID : 67463 total unique values column Loan Amount : 27525 total unique values column Funded Amount : 24548 total unique values column Funded Amount Investor : 67441 total unique values column Term : 3 total unique values column Interest Rate : 67452 total unique values column Home Ownership : 67454 total unique values column Debit to Income : 67454 total unique values column Delinquency - two years : 9 total unique values column Inquires - six months : 6 total unique values column Open Account : 36 total unique values column Public Record : 5 total unique values column Revolving Balance : 20582 total unique values column Revolving Utilities : 67458 total unique values column Total Accounts : 69 total unique values column Total Received Interest : 67451 total unique values column Total Received Late Fee : 67404 total unique values column Recoveries : 67387 total unique values column Collection Recovery Fee : 67313 total unique values column Collection 12 months Medical : 2 total unique values column Last week Pay : 162 total unique values column Accounts Delinquent : 1 total unique values column Total Collection Amount : 2193 total unique values column Total Current Balance : 61039 total unique values column Total Revolving Credit Limit : 37708 total unique values column Loan Status : 2
for i in df[['Term','Delinquency - two years','Inquires - six months','Public Record','Collection 12 months Medical','Accounts Delinquent']]:
unique_data_num_min = df[i].value_counts()
print(unique_data_num_min.to_markdown(), '\n\n')
| | Term | |---:|-------:| | 59 | 43780 | | 58 | 22226 | | 36 | 1457 | | | Delinquency - two years | |---:|--------------------------:| | 0 | 52054 | | 1 | 11736 | | 2 | 2651 | | 3 | 445 | | 7 | 252 | | 6 | 191 | | 5 | 74 | | 8 | 44 | | 4 | 16 | | | Inquires - six months | |---:|------------------------:| | 0 | 60486 | | 1 | 4558 | | 2 | 2042 | | 3 | 320 | | 4 | 54 | | 5 | 3 | | | Public Record | |---:|----------------:| | 0 | 62871 | | 1 | 4133 | | 2 | 200 | | 4 | 184 | | 3 | 75 | | | Collection 12 months Medical | |---:|-------------------------------:| | 0 | 66026 | | 1 | 1437 | | | Accounts Delinquent | |---:|----------------------:| | 0 | 67463 |
Since the data in the 'Accounts Delinquent' column contains only one value, this column will be deleted.
# drop Accounts Delinquent column
df.drop(['Accounts Delinquent'], axis=1, inplace=True)
for i in df.select_dtypes(exclude=['number']):
unique_data = df[i].value_counts()
print(unique_data.to_markdown(), '\n\n')
| | Batch Enrolled | |:-----------|-----------------:| | BAT3873588 | 3626 | | BAT1586599 | 3142 | | BAT1104812 | 2996 | | BAT2252229 | 2557 | | BAT2803411 | 2425 | | BAT1780517 | 2403 | | BAT1184694 | 2298 | | BAT2078974 | 2290 | | BAT2575549 | 2257 | | BAT4694572 | 2248 | | BAT4271519 | 2054 | | BAT2558388 | 1963 | | BAT3193689 | 1864 | | BAT1930365 | 1844 | | BAT2136391 | 1790 | | BAT2333412 | 1775 | | BAT3726927 | 1774 | | BAT4136152 | 1766 | | BAT5341619 | 1717 | | BAT5525466 | 1709 | | BAT5489674 | 1677 | | BAT5629144 | 1639 | | BAT1766061 | 1461 | | BAT2833642 | 1421 | | BAT5924421 | 1404 | | BAT2522922 | 1399 | | BAT2428731 | 1398 | | BAT4808022 | 1303 | | BAT4351734 | 1140 | | BAT5547201 | 1127 | | BAT5714674 | 1105 | | BAT3461431 | 1068 | | BAT224923 | 895 | | BAT1761981 | 894 | | BAT4722912 | 887 | | BAT2003848 | 842 | | BAT1467036 | 802 | | BAT5849876 | 768 | | BAT3865626 | 728 | | BAT5811547 | 711 | | BAT1135695 | 296 | | | Grade | |:---|--------:| | C | 19085 | | B | 18742 | | A | 12055 | | D | 8259 | | E | 6446 | | F | 2246 | | G | 630 | | | Sub Grade | |:---|------------:| | B4 | 10754 | | C1 | 3775 | | B3 | 3590 | | A5 | 3180 | | B2 | 3159 | | B5 | 3056 | | D1 | 2978 | | C4 | 2932 | | C2 | 2892 | | C3 | 2796 | | B1 | 2640 | | C5 | 2250 | | A4 | 2033 | | D4 | 1824 | | D2 | 1772 | | D5 | 1760 | | D3 | 1653 | | A2 | 1624 | | E2 | 1585 | | A3 | 1535 | | A1 | 1233 | | E3 | 1206 | | E1 | 1162 | | E4 | 1009 | | F2 | 853 | | F1 | 745 | | E5 | 693 | | F3 | 530 | | F5 | 518 | | F4 | 399 | | G2 | 399 | | G1 | 326 | | G5 | 253 | | G3 | 217 | | G4 | 132 | | | Employment Duration | |:---------|----------------------:| | MORTGAGE | 36351 | | RENT | 24150 | | OWN | 6962 | | | Verification Status | |:----------------|----------------------:| | Source Verified | 33036 | | Verified | 18078 | | Not Verified | 16349 | | | Payment Plan | |:---|---------------:| | n | 67463 | | | Loan Title | |:---------------------------|-------------:| | Credit card refinancing | 30728 | | Debt consolidation | 24841 | | Debt Consolidation | 3544 | | Other | 2455 | | Home improvement | 2211 | | Major purchase | 487 | | Medical expenses | 237 | | Business | 183 | | Moving and relocation | 157 | | Car financing | 135 | | Vacation | 114 | | debt consolidation | 105 | | Consolidation | 97 | | Home buying | 94 | | Personal Loan | 78 | | Debt Consolidation Loan | 71 | | Credit Card Consolidation | 68 | | Home Improvement | 68 | | Consolidate | 55 | | Consolidation Loan | 55 | | consolidation | 51 | | payoff | 50 | | DEBT CONSOLIDATION | 46 | | Green loan | 45 | | Credit Card Debt | 45 | | Freedom | 45 | | consolidate | 42 | | Credit Card Refinance | 38 | | Credit Card Payoff | 35 | | Loan | 34 | | Credit Card | 34 | | Payoff | 33 | | consolidation loan | 33 | | Lending Club | 32 | | Credit card payoff | 32 | | Credit Cards | 32 | | Credit card refinance | 31 | | Debt | 30 | | Credit Card Loan | 30 | | Refinance | 29 | | personal | 26 | | refi | 25 | | debt consolidation loan | 25 | | Personal | 25 | | Credit Consolidation | 25 | | My Loan | 24 | | Debt Free | 22 | | Debt Payoff | 22 | | credit card | 22 | | Loan 1 | 21 | | debt | 21 | | Debt Loan | 19 | | CC consolidation | 18 | | Cards | 18 | | debt loan | 18 | | Pool | 17 | | Refinance Loan | 17 | | Credit payoff | 16 | | House | 16 | | Card Consolidation | 16 | | credit card refinance | 16 | | Pay Off | 15 | | home improvement | 15 | | MYLOAN | 15 | | CC Refinance | 15 | | Bill Consolidation | 15 | | Dept consolidation | 14 | | CC Consolidation | 14 | | Bathroom | 14 | | Consolidated | 13 | | Home | 13 | | CC Loan | 13 | | Debt Reduction | 13 | | CC-Refinance | 13 | | Home Improvement Loan | 13 | | Credit Card consolidation | 13 | | loan1 | 13 | | Get Debt Free | 13 | | Lending loan | 13 | | Bill Payoff | 12 | | Credit Card Refi | 12 | | CC Refi | 12 | | Medical | 12 | | get out of debt | 12 | | Credit card pay off | 11 | | credit card consolidation | 11 | | Credit Loan | 10 | | conso | 10 | | cards | 10 | | vacation | 10 | | pay off bills | 10 | | relief | 9 | | Credit Card Paydown | 9 | | CONSOLIDATION | 9 | | Car Loan | 9 | | CONSOLIDATE | 9 | | Medical loan | 9 | | credit pay off | 9 | | Loan Consolidation | 7 | | Debt payoff | 7 | | Credit Card Refinance Loan | 6 | | Wedding Loan | 6 | | Debt Consolidation 2013 | 6 | | CC | 6 | | Home loan | 6 | | Personal loan | 5 | | Getting Ahead | 5 | | Credit | 4 | | bills | 4 | | | Initial List Status | |:---|----------------------:| | w | 36299 | | f | 31164 | | | Application Type | |:-----------|-------------------:| | INDIVIDUAL | 67340 | | JOINT | 123 | | | Gender | |:-------|---------:| | Male | 47593 | | Female | 19747 | | Joint | 123 |
From looking at the number of unique values in the 'Batch Enrolled', 'Grade', 'Sub Grade', 'Employment Duration', 'Verification Status', 'Payment Plan', 'Initial List Status', 'Application Type', and 'Gender', no typo data was found.
Since the data in the 'Payment Plan' column contains only one value, this column will be deleted.
# drop Payment Plan column
df.drop(['Payment Plan'], axis=1, inplace=True)
check 'Loan Title Column'
print("number of unique 'Loan Title", df.select_dtypes(exclude=['number'])['Loan Title'].nunique())
number of unique 'Loan Title 109
because the number of unique values in the 'Loan Title' column is quite a lot and it is difficult to check for typo data, here a method will be used to find similar strings within one column.
!pip install rapidfuzz
Collecting rapidfuzz
Downloading rapidfuzz-2.0.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
|████████████████████████████████| 1.6 MB 5.1 MB/s
Collecting jarowinkler<1.1.0,>=1.0.2
Downloading jarowinkler-1.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (103 kB)
|████████████████████████████████| 103 kB 53.9 MB/s
Installing collected packages: jarowinkler, rapidfuzz
Successfully installed jarowinkler-1.0.2 rapidfuzz-2.0.7
from rapidfuzz import fuzz
# create df customer_city of unique value
unique_title = df.select_dtypes(exclude=['number'])['Loan Title'].unique().tolist()
df_title = pd.DataFrame(unique_title , columns=['unique_title'])
# check similar string
list_title = []
unique_tlt = df_title['unique_title']
for index, row1 in unique_tlt.items():
# skip elements that are already compared
for row2 in unique_tlt.iloc[index+1::]:
# use a score_cutoff to improve the runtime for bad matches
score = fuzz.ratio(row1, row2, score_cutoff=80)
if score:
list_title.append([row1, row2, score])
df_match_title = pd.DataFrame(list_title, columns=['unique_title','unique_title_match','score'])
print(df_match_title.sort_values(['unique_title']).to_markdown())
| | unique_title | unique_title_match | score | |---:|:--------------------------|:---------------------------|--------:| | 82 | Bill Consolidation | CC Consolidation | 82.3529 | | 92 | CC Consolidation | Loan Consolidation | 82.3529 | | 75 | CC Refinance | CC-Refinance | 91.6667 | | 76 | CC consolidation | consolidation | 89.6552 | | 77 | CC consolidation | CC Consolidation | 93.75 | | 91 | CONSOLIDATE | CONSOLIDATION | 83.3333 | | 58 | Card Consolidation | CC consolidation | 82.3529 | | 59 | Card Consolidation | CC Consolidation | 88.2353 | | 60 | Card Consolidation | Loan Consolidation | 83.3333 | | 65 | Consolidate | Consolidated | 95.6522 | | 56 | Consolidation | CC Consolidation | 89.6552 | | 57 | Consolidation | Loan Consolidation | 83.871 | | 54 | Consolidation | Bill Consolidation | 83.871 | | 51 | Consolidation | Consolidate | 83.3333 | | 52 | Consolidation | Consolidation Loan | 83.871 | | 53 | Consolidation | CC consolidation | 89.6552 | | 50 | Consolidation | Card Consolidation | 83.871 | | 55 | Consolidation | consolidation | 92.3077 | | 40 | Credit Card Consolidation | Card Consolidation | 83.7209 | | 41 | Credit Card Consolidation | Credit Card consolidation | 96 | | 74 | Credit Card Debt | Credit Card | 81.4815 | | 72 | Credit Card Loan | Credit Card | 81.4815 | | 73 | Credit Card Loan | Credit Loan | 81.4815 | | 84 | Credit Card Paydown | Credit Card Payoff | 86.4865 | | 43 | Credit Card Refi | Credit Card Debt | 81.25 | | 44 | Credit Card Refi | Credit Card | 81.4815 | | 45 | Credit Card Refi | Credit Card Refinance | 86.4865 | | 80 | Credit Card Refinance | Credit Card Refinance Loan | 89.3617 | | 78 | Credit Card Refinance | credit card refinance | 85.7143 | | 79 | Credit Card Refinance | Credit card refinance | 90.4762 | | 25 | Credit Cards | Credit Card | 95.6522 | | 24 | Credit Consolidation | CC Consolidation | 83.3333 | | 23 | Credit Consolidation | Credit Card consolidation | 88.8889 | | 22 | Credit Consolidation | Card Consolidation | 89.4737 | | 21 | Credit Consolidation | Credit Card Consolidation | 88.8889 | | 86 | Credit card pay off | Credit payoff | 81.25 | | 85 | Credit card pay off | Credit Card Payoff | 86.4865 | | 37 | Credit card payoff | Credit card pay off | 97.2973 | | 38 | Credit card payoff | Credit Card Payoff | 88.8889 | | 39 | Credit card payoff | Credit payoff | 83.871 | | 87 | Credit card refinance | Credit Card Refinance Loan | 80.8511 | | 15 | Credit card refinancing | Credit Card Refinance | 81.8182 | | 16 | Credit card refinancing | credit card refinance | 86.3636 | | 17 | Credit card refinancing | Credit card refinance | 90.9091 | | 26 | DEBT CONSOLIDATION | CONSOLIDATION | 83.871 | | 7 | Debt Consolidation | CC Consolidation | 82.3529 | | 6 | Debt Consolidation | Debt Consolidation 2013 | 87.8049 | | 5 | Debt Consolidation | Dept consolidation | 88.8889 | | 4 | Debt Consolidation | debt consolidation | 88.8889 | | 3 | Debt Consolidation | Consolidation | 83.871 | | 2 | Debt Consolidation | Debt Consolidation Loan | 87.8049 | | 1 | Debt Consolidation | Credit Consolidation | 84.2105 | | 0 | Debt Consolidation | Debt consolidation | 94.4444 | | 34 | Debt Consolidation Loan | Consolidation Loan | 87.8049 | | 33 | Debt Consolidation Loan | Debt Consolidation 2013 | 82.6087 | | 35 | Debt Consolidation Loan | debt consolidation loan | 86.9565 | | 93 | Debt Payoff | Debt payoff | 90.9091 | | 8 | Debt consolidation | Debt Consolidation Loan | 82.9268 | | 9 | Debt consolidation | debt consolidation | 94.4444 | | 10 | Debt consolidation | Dept consolidation | 94.4444 | | 11 | Debt consolidation | Debt Consolidation 2013 | 82.9268 | | 12 | Debt consolidation | CC consolidation | 82.3529 | | 13 | Debt consolidation | consolidation | 83.871 | | 14 | Debt consolidation | debt consolidation loan | 82.9268 | | 69 | Dept consolidation | CC consolidation | 82.3529 | | 70 | Dept consolidation | consolidation | 83.871 | | 88 | Get Debt Free | Debt Free | 81.8182 | | 28 | Home Improvement | Home Improvement Loan | 86.4865 | | 27 | Home Improvement | home improvement | 87.5 | | 18 | Home improvement | Home Improvement | 93.75 | | 19 | Home improvement | home improvement | 93.75 | | 20 | Home improvement | Home Improvement Loan | 81.0811 | | 71 | Personal | personal | 87.5 | | 46 | Personal Loan | Personal loan | 92.3077 | | 29 | Refinance | CC Refinance | 85.7143 | | 30 | Refinance | CC-Refinance | 85.7143 | | 36 | Vacation | vacation | 87.5 | | 47 | consolidate | Consolidate | 90.9091 | | 48 | consolidate | Consolidated | 86.9565 | | 49 | consolidate | consolidation | 83.3333 | | 90 | consolidation | CC Consolidation | 82.7586 | | 66 | consolidation loan | Consolidation Loan | 88.8889 | | 68 | consolidation loan | debt consolidation loan | 87.8049 | | 67 | consolidation loan | consolidation | 83.871 | | 42 | credit card | Credit Card | 81.8182 | | 32 | credit card consolidation | Credit Card consolidation | 92 | | 31 | credit card consolidation | Credit Card Consolidation | 88 | | 83 | credit card refinance | Credit card refinance | 95.2381 | | 89 | credit pay off | Credit payoff | 88.8889 | | 64 | debt consolidation | debt consolidation loan | 87.8049 | | 63 | debt consolidation | consolidation | 83.871 | | 62 | debt consolidation | CC consolidation | 82.3529 | | 61 | debt consolidation | Dept consolidation | 88.8889 | | 81 | payoff | Payoff | 83.3333 |
string transformation
unique_title_trfs = [
'payoff',
'payoff',
'payoff',
'payoff',
'payoff',
'payoff',
'Credit Card Consolidation',
'Credit Card Consolidation',
'Credit Card Consolidation',
'Credit Card Consolidation',
'Credit Card Consolidation',
'Refinance',
'Refinance',
'Refinance',
'Refinance',
'Refinance',
'Refinance',
'Refinance',
'Refinance',
'Consolidation',
'Consolidation',
'Consolidation',
'Consolidation',
'Consolidation',
'Consolidation',
'Consolidation',
'Credit Loan',
'Credit Card',
'Credit Card',
'Debt Consolidation',
'Debt Consolidation',
'Debt Consolidation',
'Debt Consolidation',
'Debt Consolidation',
'Debt Consolidation',
'Debt Consolidation',
'Debt Consolidation',
'Debt Consolidation',
'Debt Payoff',
'Debt Free',
'Home Improvement',
'Home Improvement',
'Home Improvement',
'Personal',
'Personal',
'Personal',
'Vacation',
]
unique_title_match_trfs = [
'Credit card pay off',
'Credit payoff',
'Credit Card Payoff',
'credit pay off',
'Credit card payoff',
'Payoff',
'Credit Card consolidation',
'credit card consolidation',
'CC consolidation',
'CC Consolidation',
'Credit Consolidation',
'CC-Refinance',
'Credit Card Refi',
'Credit Card Refinance',
'Credit Card Refinance Loan',
'credit card refinance',
'Credit card refinance',
'Credit card refinancing',
'CC Refinance',
'CONSOLIDATE',
'CONSOLIDATION',
'Consolidate',
'Consolidated',
'Card Consolidation',
'consolidation',
'consolidate',
'Credit Card Loan',
'Credit Cards',
'credit card',
'DEBT CONSOLIDATION',
'Debt Consolidation 2013',
'Dept consolidation',
'debt consolidation',
'Debt Consolidation Loan',
'debt consolidation loan',
'Debt consolidation',
'consolidation loan',
'Consolidation Loan',
'Debt payoff',
'Get Debt Free',
'Home Improvement Loan',
'home improvement',
'Home improvement',
'personal',
'Personal Loan',
'Personal loan',
'vacation'
]
df_transform = df.copy()
for i in range(len(unique_title_trfs)):
unique_title_trf = unique_title_trfs[i]
unique_title_match_trf = unique_title_match_trfs[i]
#check number of string that will be transformed in 'customer_city' column
print("the sum of",unique_title_trf,"before transform is",df_transform ['Loan Title'].str.contains(unique_title_trf).sum())
print("the sum of",unique_title_match_trf,"before transform is",df_transform ['Loan Title'].str.contains(unique_title_match_trf).sum(),"\n")
#replace the incorrect strings in 'customer_city' column
df_transform['Loan Title'] = df_transform['Loan Title'].str.replace(unique_title_match_trf, unique_title_trf)
#check number of string after transform in 'customer_city' column
print("the sum of",unique_title_trf,"after transform is",df_transform['Loan Title'].str.contains(unique_title_trf).sum())
print("the sum of",unique_title_match_trf,"after transform is",df_transform['Loan Title'].str.contains(unique_title_match_trf).sum(),"\n\n")
the sum of payoff before transform is 105 the sum of Credit card pay off before transform is 11 the sum of payoff after transform is 116 the sum of Credit card pay off after transform is 0 the sum of payoff before transform is 116 the sum of Credit payoff before transform is 16 the sum of payoff after transform is 116 the sum of Credit payoff after transform is 0 the sum of payoff before transform is 116 the sum of Credit Card Payoff before transform is 35 the sum of payoff after transform is 151 the sum of Credit Card Payoff after transform is 0 the sum of payoff before transform is 151 the sum of credit pay off before transform is 9 the sum of payoff after transform is 160 the sum of credit pay off after transform is 0 the sum of payoff before transform is 160 the sum of Credit card payoff before transform is 32 the sum of payoff after transform is 160 the sum of Credit card payoff after transform is 0 the sum of payoff before transform is 160 the sum of Payoff before transform is 67 the sum of payoff after transform is 227 the sum of Payoff after transform is 0 the sum of Credit Card Consolidation before transform is 68 the sum of Credit Card consolidation before transform is 13 the sum of Credit Card Consolidation after transform is 81 the sum of Credit Card consolidation after transform is 0 the sum of Credit Card Consolidation before transform is 81 the sum of credit card consolidation before transform is 11 the sum of Credit Card Consolidation after transform is 92 the sum of credit card consolidation after transform is 0 the sum of Credit Card Consolidation before transform is 92 the sum of CC consolidation before transform is 18 the sum of Credit Card Consolidation after transform is 110 the sum of CC consolidation after transform is 0 the sum of Credit Card Consolidation before transform is 110 the sum of CC Consolidation before transform is 14 the sum of Credit Card Consolidation after transform is 124 the sum of CC Consolidation after transform is 0 the sum of Credit Card Consolidation before transform is 124 the sum of Credit Consolidation before transform is 25 the sum of Credit Card Consolidation after transform is 149 the sum of Credit Consolidation after transform is 0 the sum of Refinance before transform is 118 the sum of CC-Refinance before transform is 13 the sum of Refinance after transform is 118 the sum of CC-Refinance after transform is 0 the sum of Refinance before transform is 118 the sum of Credit Card Refi before transform is 56 the sum of Refinance after transform is 130 the sum of Credit Card Refi after transform is 0 the sum of Refinance before transform is 130 the sum of Credit Card Refinance before transform is 0 the sum of Refinance after transform is 130 the sum of Credit Card Refinance after transform is 0 the sum of Refinance before transform is 130 the sum of Credit Card Refinance Loan before transform is 0 the sum of Refinance after transform is 130 the sum of Credit Card Refinance Loan after transform is 0 the sum of Refinance before transform is 130 the sum of credit card refinance before transform is 16 the sum of Refinance after transform is 146 the sum of credit card refinance after transform is 0 the sum of Refinance before transform is 146 the sum of Credit card refinance before transform is 31 the sum of Refinance after transform is 177 the sum of Credit card refinance after transform is 0 the sum of Refinance before transform is 177 the sum of Credit card refinancing before transform is 30728 the sum of Refinance after transform is 30905 the sum of Credit card refinancing after transform is 0 the sum of Refinance before transform is 30905 the sum of CC Refinance before transform is 15 the sum of Refinance after transform is 30905 the sum of CC Refinance after transform is 0 the sum of Consolidation before transform is 3960 the sum of CONSOLIDATE before transform is 9 the sum of Consolidation after transform is 3969 the sum of CONSOLIDATE after transform is 0 the sum of Consolidation before transform is 3969 the sum of CONSOLIDATION before transform is 55 the sum of Consolidation after transform is 4024 the sum of CONSOLIDATION after transform is 0 the sum of Consolidation before transform is 4024 the sum of Consolidate before transform is 68 the sum of Consolidation after transform is 4092 the sum of Consolidate after transform is 0 the sum of Consolidation before transform is 4092 the sum of Consolidated before transform is 0 the sum of Consolidation after transform is 4092 the sum of Consolidated after transform is 0 the sum of Consolidation before transform is 4092 the sum of Card Consolidation before transform is 165 the sum of Consolidation after transform is 4092 the sum of Card Consolidation after transform is 0 the sum of Consolidation before transform is 4092 the sum of consolidation before transform is 25069 the sum of Consolidation after transform is 29161 the sum of consolidation after transform is 0 the sum of Consolidation before transform is 29161 the sum of consolidate before transform is 42 the sum of Consolidation after transform is 29203 the sum of consolidate after transform is 0 the sum of Credit Loan before transform is 10 the sum of Credit Card Loan before transform is 30 the sum of Credit Loan after transform is 40 the sum of Credit Card Loan after transform is 0 the sum of Credit Card before transform is 120 the sum of Credit Cards before transform is 32 the sum of Credit Card after transform is 120 the sum of Credit Cards after transform is 0 the sum of Credit Card before transform is 120 the sum of credit card before transform is 22 the sum of Credit Card after transform is 142 the sum of credit card after transform is 0 the sum of Debt Consolidation before transform is 28462 the sum of DEBT CONSOLIDATION before transform is 0 the sum of Debt Consolidation after transform is 28462 the sum of DEBT CONSOLIDATION after transform is 0 the sum of Debt Consolidation before transform is 28462 the sum of Debt Consolidation 2013 before transform is 6 the sum of Debt Consolidation after transform is 28462 the sum of Debt Consolidation 2013 after transform is 0 the sum of Debt Consolidation before transform is 28462 the sum of Dept consolidation before transform is 0 the sum of Debt Consolidation after transform is 28462 the sum of Dept consolidation after transform is 0 the sum of Debt Consolidation before transform is 28462 the sum of debt consolidation before transform is 0 the sum of Debt Consolidation after transform is 28462 the sum of debt consolidation after transform is 0 the sum of Debt Consolidation before transform is 28462 the sum of Debt Consolidation Loan before transform is 71 the sum of Debt Consolidation after transform is 28462 the sum of Debt Consolidation Loan after transform is 0 the sum of Debt Consolidation before transform is 28462 the sum of debt consolidation loan before transform is 0 the sum of Debt Consolidation after transform is 28462 the sum of debt consolidation loan after transform is 0 the sum of Debt Consolidation before transform is 28462 the sum of Debt consolidation before transform is 0 the sum of Debt Consolidation after transform is 28462 the sum of Debt consolidation after transform is 0 the sum of Debt Consolidation before transform is 28462 the sum of consolidation loan before transform is 0 the sum of Debt Consolidation after transform is 28462 the sum of consolidation loan after transform is 0 the sum of Debt Consolidation before transform is 28462 the sum of Consolidation Loan before transform is 55 the sum of Debt Consolidation after transform is 28517 the sum of Consolidation Loan after transform is 0 the sum of Debt Payoff before transform is 0 the sum of Debt payoff before transform is 29 the sum of Debt Payoff after transform is 29 the sum of Debt payoff after transform is 0 the sum of Debt Free before transform is 35 the sum of Get Debt Free before transform is 13 the sum of Debt Free after transform is 35 the sum of Get Debt Free after transform is 0 the sum of Home Improvement before transform is 81 the sum of Home Improvement Loan before transform is 13 the sum of Home Improvement after transform is 81 the sum of Home Improvement Loan after transform is 0 the sum of Home Improvement before transform is 81 the sum of home improvement before transform is 15 the sum of Home Improvement after transform is 96 the sum of home improvement after transform is 0 the sum of Home Improvement before transform is 96 the sum of Home improvement before transform is 2211 the sum of Home Improvement after transform is 2307 the sum of Home improvement after transform is 0 the sum of Personal before transform is 108 the sum of personal before transform is 26 the sum of Personal after transform is 134 the sum of personal after transform is 0 the sum of Personal before transform is 134 the sum of Personal Loan before transform is 78 the sum of Personal after transform is 134 the sum of Personal Loan after transform is 0 the sum of Personal before transform is 134 the sum of Personal loan before transform is 5 the sum of Personal after transform is 134 the sum of Personal loan after transform is 0 the sum of Vacation before transform is 114 the sum of vacation before transform is 10 the sum of Vacation after transform is 124 the sum of vacation after transform is 0
print("number of unique 'Loan Title before cleaning typos", df.select_dtypes(exclude=['number'])['Loan Title'].nunique())
print("number of unique 'Loan Title after cleaning typos", df_transform['Loan Title'].nunique())
number of unique 'Loan Title before cleaning typos 109 number of unique 'Loan Title after cleaning typos 70
Feature Selection
df_transform.drop(['Unnamed: 0','ID'], axis=1, inplace=True)
These columns ('Unnamed: 0' and 'ID') were deleted because they would not affect the analysis. The other features will be used to see the correlation and insights so there are no too many assumptions
Label Data
label=df_transform[['Loan Status']]
label.head(3)
| Loan Status | |
|---|---|
| 0 | 0 |
| 1 | 0 |
| 2 | 0 |
Numerical Data
numerical = df_transform.drop(['Loan Status'], axis=1).select_dtypes(include=['number'])
numerical.head(3)
| Loan Amount | Funded Amount | Funded Amount Investor | Term | Interest Rate | Home Ownership | Debit to Income | Delinquency - two years | Inquires - six months | Open Account | Public Record | Revolving Balance | Revolving Utilities | Total Accounts | Total Received Interest | Total Received Late Fee | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Last week Pay | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10000 | 32236 | 12329.36286 | 59 | 11.135007 | 176346.62670 | 16.284758 | 1 | 0 | 13 | 0 | 24246 | 74.932551 | 7 | 2929.646315 | 0.102055 | 2.498291 | 0.793724 | 0 | 49 | 31 | 311301.0 | 6619 |
| 1 | 3609 | 11940 | 12191.99692 | 59 | 12.237563 | 39833.92100 | 15.412409 | 0 | 0 | 12 | 0 | 812 | 78.297186 | 13 | 772.769385 | 0.036181 | 2.377215 | 0.974821 | 0 | 109 | 53 | 182610.0 | 20885 |
| 2 | 28276 | 9311 | 21603.22455 | 59 | 11.850712 | 91506.69105 | 28.137619 | 0 | 0 | 14 | 0 | 1843 | 2.073040 | 20 | 863.324396 | 18.778660 | 4.316277 | 1.020075 | 0 | 66 | 34 | 89801.0 | 26155 |
Categorical Data
categorical = df_transform.select_dtypes(exclude=['number'])
categorical.head(3)
| Batch Enrolled | Grade | Sub Grade | Employment Duration | Verification Status | Loan Title | Initial List Status | Application Type | Gender | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | BAT2522922 | B | C4 | MORTGAGE | Not Verified | Debt Consolidation | w | INDIVIDUAL | Male |
| 1 | BAT1586599 | C | D3 | RENT | Source Verified | Debt Consolidation | f | INDIVIDUAL | Male |
| 2 | BAT2136391 | F | D4 | MORTGAGE | Source Verified | Debt Consolidation | w | INDIVIDUAL | Male |
from scipy import stats
def describe(df, statis):
d = df.describe()
if 'mode' in statis:
statis.remove('mode')
d = d.append(df.agg(statis))
d = d.append(df.agg(lambda x: stats.mode(x)[0]))
d = d.rename(index={0:'mode'})
else:
d = d.append(df.agg(statis))
return d
ds = describe(numerical, ['mode','var','skew','kurt'])
# add range
ds.loc['range'] = ds.loc['max'] - ds.loc['min']
# coefficient of variation (std / mean)
ds.loc['coeff of var'] = ds.loc['std'] / ds.loc['mean']
ds
| Loan Amount | Funded Amount | Funded Amount Investor | Term | Interest Rate | Home Ownership | Debit to Income | Delinquency - two years | Inquires - six months | Open Account | Public Record | Revolving Balance | Revolving Utilities | Total Accounts | Total Received Interest | Total Received Late Fee | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Last week Pay | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 6.746300e+04 | 6.746300e+04 | 6.746300e+04 | 67463.000000 | 67463.000000 | 6.746300e+04 | 67463.000000 | 67463.000000 | 67463.000000 | 67463.000000 | 67463.000000 | 6.746300e+04 | 67463.000000 | 67463.000000 | 6.746300e+04 | 67463.000000 | 67463.000000 | 67463.000000 | 67463.000000 | 67463.000000 | 67463.000000 | 6.746300e+04 | 6.746300e+04 |
| mean | 1.684890e+04 | 1.577060e+04 | 1.462180e+04 | 58.173814 | 11.844712 | 8.054150e+04 | 23.299241 | 0.327127 | 0.145754 | 14.266561 | 0.081437 | 7.699342e+03 | 52.889443 | 18.627929 | 2.068993e+03 | 1.144750 | 59.691578 | 1.125141 | 0.021301 | 71.163260 | 146.467990 | 1.598517e+05 | 2.312301e+04 |
| std | 8.367866e+03 | 8.150993e+03 | 6.785345e+03 | 3.327441 | 3.530385 | 4.502912e+04 | 8.451824 | 0.800888 | 0.473291 | 6.225060 | 0.346606 | 7.836148e+03 | 22.539450 | 8.319246 | 2.221919e+03 | 5.244283 | 357.026346 | 3.489885 | 0.144385 | 43.315845 | 744.382233 | 1.389093e+05 | 2.091670e+04 |
| min | 1.014000e+03 | 1.014000e+03 | 1.114590e+03 | 36.000000 | 5.320006 | 1.457354e+04 | 0.675299 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 0.000000e+00 | 0.005172 | 4.000000 | 4.736746e+00 | 0.000003 | 0.000036 | 0.000036 | 0.000000 | 0.000000 | 1.000000 | 6.170000e+02 | 1.000000e+03 |
| 25% | 1.001200e+04 | 9.266500e+03 | 9.831685e+03 | 58.000000 | 9.554596 | 5.168984e+04 | 16.756416 | 0.000000 | 0.000000 | 10.000000 | 0.000000 | 2.557000e+03 | 38.658825 | 13.000000 | 5.709038e+02 | 0.021127 | 1.629818 | 0.476259 | 0.000000 | 35.000000 | 24.000000 | 5.081800e+04 | 8.155500e+03 |
| 50% | 1.607300e+04 | 1.304200e+04 | 1.279368e+04 | 59.000000 | 11.834272 | 6.933583e+04 | 22.656658 | 0.000000 | 0.000000 | 13.000000 | 0.000000 | 5.516000e+03 | 54.082334 | 18.000000 | 1.330843e+03 | 0.043427 | 3.344524 | 0.780141 | 0.000000 | 68.000000 | 36.000000 | 1.196180e+05 | 1.673300e+04 |
| 75% | 2.210600e+04 | 2.179300e+04 | 1.780759e+04 | 59.000000 | 13.778949 | 9.462332e+04 | 30.048400 | 0.000000 | 0.000000 | 16.000000 | 0.000000 | 1.018450e+04 | 69.177117 | 23.000000 | 2.656957e+03 | 0.071948 | 5.453727 | 1.070566 | 0.000000 | 105.000000 | 46.000000 | 2.283400e+05 | 3.214650e+04 |
| max | 3.500000e+04 | 3.499900e+04 | 3.499975e+04 | 59.000000 | 27.182348 | 4.065615e+05 | 39.629862 | 8.000000 | 5.000000 | 37.000000 | 4.000000 | 1.169330e+05 | 100.880050 | 72.000000 | 1.430137e+04 | 42.618882 | 4354.467419 | 166.833000 | 1.000000 | 161.000000 | 16421.000000 | 1.177412e+06 | 2.011690e+05 |
| var | 7.002118e+07 | 6.643868e+07 | 4.604091e+07 | 11.071861 | 12.463618 | 2.027622e+09 | 71.433324 | 0.641422 | 0.224005 | 38.751378 | 0.120136 | 6.140522e+07 | 508.026824 | 69.209861 | 4.936923e+06 | 27.502507 | 127467.811955 | 12.179294 | 0.020847 | 1876.262457 | 554104.908847 | 1.929580e+10 | 4.375083e+08 |
| skew | 2.880830e-01 | 6.726330e-01 | 9.901388e-01 | -6.381623 | 0.593466 | 2.130488e+00 | 0.080967 | 4.635021 | 3.711972 | 1.465107 | 6.236185 | 2.951135e+00 | -0.237245 | 0.734122 | 2.135243e+00 | 5.084308 | 7.371787 | 11.102131 | 6.631051 | 0.261989 | 12.910972 | 1.511110e+00 | 1.977150e+00 |
| kurt | -7.981367e-01 | -6.171324e-01 | 4.618679e-01 | 39.597405 | 0.497292 | 7.027734e+00 | -0.905021 | 30.676297 | 15.143928 | 1.821184 | 52.959551 | 1.690317e+01 | -0.544899 | 1.326753 | 5.187492e+00 | 25.991346 | 58.183685 | 173.326303 | 41.972076 | -0.984903 | 207.016774 | 3.134200e+00 | 5.980086e+00 |
| mode | 1.593200e+04 | 1.083500e+04 | 7.890448e+03 | 59.000000 | 6.191126 | 2.713967e+04 | 17.625069 | 0.000000 | 0.000000 | 12.000000 | 0.000000 | 1.394000e+03 | 9.409247 | 18.000000 | 4.379250e+02 | 0.000065 | 0.231616 | 1.125373 | 0.000000 | 14.000000 | 39.000000 | 3.626800e+04 | 5.310000e+03 |
| range | 3.398600e+04 | 3.398500e+04 | 3.388516e+04 | 23.000000 | 21.862342 | 3.919880e+05 | 38.954563 | 8.000000 | 5.000000 | 35.000000 | 4.000000 | 1.169330e+05 | 100.874877 | 68.000000 | 1.429663e+04 | 42.618879 | 4354.467383 | 166.832964 | 1.000000 | 161.000000 | 16420.000000 | 1.176795e+06 | 2.001690e+05 |
| coeff of var | 4.966416e-01 | 5.168474e-01 | 4.640568e-01 | 0.057198 | 0.298056 | 5.590797e-01 | 0.362751 | 2.448246 | 3.247193 | 0.436339 | 4.256109 | 1.017768e+00 | 0.426162 | 0.446601 | 1.073913e+00 | 4.581160 | 5.981185 | 3.101731 | 6.778480 | 0.608683 | 5.082218 | 8.689887e-01 | 9.045840e-01 |
categorical.describe()
| Batch Enrolled | Grade | Sub Grade | Employment Duration | Verification Status | Loan Title | Initial List Status | Application Type | Gender | |
|---|---|---|---|---|---|---|---|---|---|
| count | 67463 | 67463 | 67463 | 67463 | 67463 | 67463 | 67463 | 67463 | 67463 |
| unique | 41 | 7 | 35 | 3 | 3 | 70 | 2 | 2 | 3 |
| top | BAT3873588 | C | B4 | MORTGAGE | Source Verified | Refinance | w | INDIVIDUAL | Male |
| freq | 3626 | 19085 | 10754 | 36351 | 33036 | 30844 | 36299 | 67340 | 47593 |
label.value_counts()
Loan Status 0 61222 1 6241 dtype: int64
label_s = label.iloc[:,0]
# plot
ax = sns.countplot(label_s, palette='RdBu')
# data points
tampung = []
for x in ax.patches:
tampung.append(x.get_height())
for x in ax.patches:
# ax.annotate(f'{x.get_height():0.0f}', (x.get_x() + x.get_width() / 2., x.get_height()), ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
ax.annotate(f'{100 * x.get_height() / sum(tampung):0.2f}%', (x.get_x() + x.get_width() / 2., x.get_height()), ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
# settings
plt.xlabel('Loan Status', fontsize = 13, labelpad = 20)
plt.ylabel('Frequency', fontsize = 13, labelpad = 20)
plt.title("Number of Representative that Defaulter (1) and Non Defaulter (0) are Imbalanced", fontsize = 16, pad = 30)
sns.despine(top=True, right=True, left=False, bottom=False)
ax.spines['left'].set_color('lightgray')
ax.spines['bottom'].set_color('lightgray')
The graph above shows that loan status categories that are 'Defaulter' and 'Non Defaulter' have an imbalanced representative frequency
len(numerical.columns)
23
plt.figure(figsize=(20,27))
for i in range(0, len(numerical.columns)):
ax = plt.subplot(8,3,i+1)
sns.boxplot(numerical[numerical.columns[i]], color='lightblue')
plt.tight_layout()
plt.figure(figsize=(20,27))
for i in range(0, len(numerical.columns)):
ax = plt.subplot(8,3,i+1)
sns.distplot(numerical[numerical.columns[i]], color='blue')
plt.tight_layout()
plt.figure(figsize=(10,5))
# plot
ax = sns.countplot(x="Grade", data=df_transform, hue='Loan Status', palette="Blues");
sns.despine(top=True, right=True, left=False, bottom=False);
ax.spines['left'].set_color('lightgray');
ax.spines['bottom'].set_color('lightgray');
# data points
tampung = []
for x in ax.patches:
tampung.append(x.get_height())
for x in ax.patches:
# ax.annotate(f'{x.get_height():0.0f}', (x.get_x() + x.get_width() / 2., x.get_height()), ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
ax.annotate(f'{100 * x.get_height() / sum(tampung):0.2f}%', (x.get_x() + x.get_width() / 2. + 0.05, x.get_height()), ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
# settings
ax.set_ylabel('Frequency');
ax.set_title('Grades A and B Most Affect Non-Defaulter Status', fontsize = 14, pad = 50);
plt.figure(figsize=(10,5))
# plot
ax = sns.countplot(x="Employment Duration", data=df_transform, hue='Loan Status', palette="Blues");
sns.despine(top=True, right=True, left=False, bottom=False);
ax.spines['left'].set_color('lightgray');
ax.spines['bottom'].set_color('lightgray');
# data points
tampung = []
for x in ax.patches:
tampung.append(x.get_height())
for x in ax.patches:
# ax.annotate(f'{x.get_height():0.0f}', (x.get_x() + x.get_width() / 2., x.get_height()), ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
ax.annotate(f'{100 * x.get_height() / sum(tampung):0.2f}%', (x.get_x() + x.get_width() / 2. + 0.05, x.get_height()), ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
# settings
ax.set_ylabel('Frequency');
ax.set_title('There is No Significant Difference on Loan Status Ratio Based on Employment Duration', fontsize = 14, pad = 50);
plt.figure(figsize=(10,5))
# plot
ax = sns.countplot(x="Verification Status", data=df_transform, hue='Loan Status', palette="Blues");
sns.despine(top=True, right=True, left=False, bottom=False);
ax.spines['left'].set_color('lightgray');
ax.spines['bottom'].set_color('lightgray');
# data points
tampung = []
for x in ax.patches:
tampung.append(x.get_height())
for x in ax.patches:
# ax.annotate(f'{x.get_height():0.0f}', (x.get_x() + x.get_width() / 2., x.get_height()), ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
ax.annotate(f'{100 * x.get_height() / sum(tampung):0.2f}%', (x.get_x() + x.get_width() / 2. + 0.05, x.get_height()), ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
# settings
ax.set_ylabel('Frequency');
ax.set_title('Status Verified Most Affect Non-Defaulter Status', fontsize = 14, pad = 50);
plt.figure(figsize=(10,5))
# plot
ax = sns.countplot(x="Initial List Status", data=df_transform, hue='Loan Status', palette="Blues");
sns.despine(top=True, right=True, left=False, bottom=False);
ax.spines['left'].set_color('lightgray');
ax.spines['bottom'].set_color('lightgray');
# data points
tampung = []
for x in ax.patches:
tampung.append(x.get_height())
for x in ax.patches:
# ax.annotate(f'{x.get_height():0.0f}', (x.get_x() + x.get_width() / 2., x.get_height()), ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
ax.annotate(f'{100 * x.get_height() / sum(tampung):0.2f}%', (x.get_x() + x.get_width() / 2. + 0.05, x.get_height()), ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
# settings
ax.set_ylabel('Frequency');
ax.set_title('Initial List Status of W Most Affect Non-Defaulter Status', fontsize = 14, pad = 50);
plt.figure(figsize=(10,5))
# plot
ax = sns.countplot(x="Application Type", data=df_transform, hue='Loan Status', palette="Blues");
sns.despine(top=True, right=True, left=False, bottom=False);
ax.spines['left'].set_color('lightgray');
ax.spines['bottom'].set_color('lightgray');
# data points
tampung = []
for x in ax.patches:
tampung.append(x.get_height())
for x in ax.patches:
# ax.annotate(f'{x.get_height():0.0f}', (x.get_x() + x.get_width() / 2., x.get_height()), ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
ax.annotate(f'{100 * x.get_height() / sum(tampung):0.2f}%', (x.get_x() + x.get_width() / 2. + 0.05, x.get_height()), ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
# settings
ax.set_ylabel('Frequency');
ax.set_title('Application Type of Joint Most Affect Non-Defaulter Status', fontsize = 14, pad = 50);
plt.figure(figsize=(10,5))
# plot
ax = sns.countplot(x="Gender", data=df_transform, hue='Loan Status', palette="Blues");
sns.despine(top=True, right=True, left=False, bottom=False);
ax.spines['left'].set_color('lightgray');
ax.spines['bottom'].set_color('lightgray');
# data points
tampung = []
for x in ax.patches:
tampung.append(x.get_height())
for x in ax.patches:
# ax.annotate(f'{x.get_height():0.0f}', (x.get_x() + x.get_width() / 2., x.get_height()), ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
ax.annotate(f'{100 * x.get_height() / sum(tampung):0.2f}%', (x.get_x() + x.get_width() / 2. + 0.05, x.get_height()), ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')
# settings
ax.set_ylabel('Frequency');
ax.set_title('Gender Joint Most Affect Non-Defaulter Status', fontsize = 14, pad = 50);
from scipy.stats import pearsonr
def reg_coef(x,y,label=None,color=None,**kwargs):
ax = plt.gca()
r,p = pearsonr(x,y)
ax.annotate('r = {:.2f}'.format(r), xy=(0.5,0.5), xycoords='axes fraction', ha='center', size=20)
ax.set_axis_off()
g = sns.PairGrid(data=numerical)
g.map_diag(sns.distplot)
g.map_lower(sns.regplot)
g.map_upper(reg_coef);
There are clear clusters shown on graph:
sns.pairplot(data=df_transform, hue = "Loan Status", palette = "RdBu");